def calc_bin(self, _bin=None):
    """
    Return `_bin` if given, otherwise compute a bin from this object's
    coordinates.

    Parameters
    ----------
    _bin : object or None
        If not None, it is returned unchanged. If None, the bin is computed
        with `bins.bins(self.start, self.end, one=True)`.

    Returns
    -------
    The supplied or computed bin, or None if `bins.bins` raises a TypeError
    (NOTE(review): presumably when start or end is None -- confirm).
    """
    # Guard clause: a caller-supplied bin always wins.
    if _bin is not None:
        return _bin
    try:
        return bins.bins(self.start, self.end, one=True)
    except TypeError:
        return None
def calc_bin(self, _bin=None):
    """
    Calculate the smallest UCSC genomic bin that will contain this feature.

    Parameters
    ----------
    _bin : object or None
        A precomputed bin. If it is not None it is handed straight back and
        no calculation is done.

    Returns
    -------
    `_bin` when supplied; otherwise the result of
    `bins.bins(self.start, self.end, one=True)`, or None when that call
    raises a TypeError.
    """
    if _bin is None:
        try:
            computed = bins.bins(self.start, self.end, one=True)
        except TypeError:
            # Coordinates could not be binned; report "no bin".
            computed = None
        return computed
    return _bin
def _bin_from_dict(d): """ Given a dictionary yielded by the parser, return the genomic "UCSC" bin """ try: start = int(d['start']) end = int(d['end']) return bins.bins(start, end, one=True) # e.g., if "." except ValueError: return None
def interfeatures(self, features, new_featuretype=None,
                  merge_attributes=True, dialect=None,
                  attribute_func=None, update_attributes=None):
    """
    Construct new features representing the space between features.

    For example, if `features` is a list of exons, then this method will
    return the introns.  If `features` is a list of genes, then this method
    will return the intergenic regions.

    Providing N features on the same chromosome will return N - 1 new
    features.  No inter-feature is created across a chromosome change.

    This method purposefully does *not* do any merging or sorting of
    coordinates, so you may want to use :meth:`FeatureDB.merge` first, or
    when selecting features use the `order_by` kwarg, e.g.,
    `db.features_of_type('gene', order_by=('seqid', 'start'))`.

    Parameters
    ----------
    features : iterable of :class:`feature.Feature` instances
        Sorted, merged iterable

    new_featuretype : string or None
        The new features will all be of this type, or, if None (default)
        then the featuretype of each new feature is constructed from its two
        neighboring features, e.g., `inter_exon_exon`.

    merge_attributes : bool
        If True, new features' attributes will be a merge (via
        `helpers.merge_attributes`) of the two neighboring features'
        attributes.  This is useful if you have provided a list of exons;
        the introns will then retain the transcript and/or gene parents.
        If False, the new features start with an empty attributes dict.

    dialect : dict or None
        Accepted for API compatibility.
        NOTE(review): this implementation does not forward it to
        `self._feature_returner` -- confirm whether it should.

    attribute_func : callable or None
        Accepted for API compatibility.
        NOTE(review): this implementation never calls it -- confirm the
        intended semantics before wiring it in.

    update_attributes : dict
        After attributes have been merged, this dictionary is used to
        replace parts of the attributes dictionary (via `dict.update`).

    Returns
    -------
    A generator that yields :class:`Feature` objects
    """
    last_feature = None
    for f in features:
        # no inter-feature for the first one
        if last_feature is None:
            last_feature = f
            continue

        if last_feature.chrom != f.chrom:
            # We've moved to a new chromosome.  For example, if we're
            # getting intergenic regions from all genes, they will be on
            # different chromosomes. We still assume sorted features, but
            # don't complain if they're on different chromosomes -- just
            # move on.  Because the interval below is derived from
            # `last_feature`, the next inter-feature starts on this
            # chromosome rather than at the previous chromosome's stop.
            last_feature = f
            continue

        # Resolve the type per pair; do not overwrite the argument, or the
        # first pair's auto-generated name would stick for all later pairs.
        if new_featuretype is None:
            featuretype = 'inter_%s_%s' % (
                last_feature.featuretype, f.featuretype)
        else:
            featuretype = new_featuretype

        # Neighbors on opposite strands give an unstranded inter-feature.
        if last_feature.strand != f.strand:
            strand = '.'
        else:
            strand = f.strand

        # Shrink: the gap excludes the flanking features' own end bases.
        interfeature_start = last_feature.stop + 1
        interfeature_stop = f.start - 1

        if merge_attributes:
            new_attributes = helpers.merge_attributes(
                last_feature.attributes, f.attributes)
        else:
            new_attributes = {}

        if update_attributes:
            new_attributes.update(update_attributes)

        new_bin = bins.bins(
            interfeature_start, interfeature_stop, one=True)
        fields = dict(
            seqid=last_feature.chrom,
            source='gffutils_derived',
            featuretype=featuretype,
            start=interfeature_start,
            end=interfeature_stop,
            score='.',
            strand=strand,
            frame='.',
            attributes=new_attributes,
            bin=new_bin)

        yield self._feature_returner(**fields)

        # Advance so the next gap is measured from, and merged with, the
        # actual neighbor rather than the first feature seen.
        last_feature = f
def region(self, region=None, seqid=None, start=None, end=None,
           strand=None, featuretype=None, completely_within=False):
    """
    Return features within specified genomic coordinates.

    Specifying genomic coordinates can be done in a flexible manner

    Parameters
    ----------
    region : string, tuple, or Feature instance
        If string, then of the form "seqid:start-end" (a bare "seqid" is
        also accepted, and a third ":"-separated token is used as the
        strand).  If tuple, then (seqid, start, end).  If
        :class:`Feature`, then use the features seqid, start, and end
        values.

        This argument is mutually exclusive with start/end/seqid.

        *Note*: By design, even if a feature is provided, its strand will
        be ignored.  If you want to restrict the output by strand, use the
        separate `strand` kwarg.

    strand : + | - | . | None
        If `strand` is provided, then only those features exactly matching
        `strand` will be returned. So `strand='.'` will only return
        unstranded features. Default is `strand=None` which does not
        restrict by strand.

    seqid, start, end
        Mutually exclusive with `region`.  These kwargs can be used to
        approximate slice notation; see "Notes" section below.

    featuretype : None, string, or iterable
        If not None, then restrict output.  If string, then only report
        that feature type.  If iterable, then report all featuretypes in
        the iterable.

    completely_within : bool
        By default (`completely_within=False`), returns features that
        partially or completely overlap `region`.  If
        `completely_within=True`, features that are completely within
        `region` will be returned.

    Notes
    -----
    The meaning of `seqid`, `start`, and `end` is interpreted as follows:

    ====== ====== ===== ======================================
    seqid  start  end   meaning
    ====== ====== ===== ======================================
    str    int    int   equivalent to `region` kwarg
    None   int    int   features from all chroms within coords
    str    None   int   equivalent to [:end] slice notation
    str    int    None  equivalent to [start:] slice notation
    None   None   None  equivalent to FeatureDB.all_features()
    ====== ====== ===== ======================================

    If performance is a concern, use `completely_within=True`. This allows
    the query to be optimized by only looking for features that fall in
    the precise genomic bin (same strategy as UCSC Genome Browser and
    BEDTools). Otherwise all features' start/stop coords need to be
    searched to see if they partially overlap the region of interest.

    Examples
    --------
    - `region(seqid="chr1", start=1000)` returns all features on chr1 that
      start or extend past position 1000

    - `region(seqid="chr1", start=1000, completely_within=True)` returns
      all features on chr1 that start past position 1000.

    - `region("chr1:1-100", strand="+", completely_within=True)` returns
      only plus-strand features that completely fall within positions 1 to
      100 on chr1.

    Raises
    ------
    ValueError
        If `region` is given together with `seqid`, `start` or `end`.
        Because this is a generator, the error surfaces on first iteration.

    Returns
    -------
    A generator object that yields :class:`Feature` objects.
    """
    # Argument handling.
    if region is not None:
        if (seqid is not None) or (start is not None) or (end is not None):
            raise ValueError(
                "If region is supplied, do not supply seqid, "
                "start, or end as separate kwargs")
        if isinstance(region, six.string_types):
            toks = region.split(':')
            if len(toks) == 1:
                seqid = toks[0]
                start, end = None, None
            else:
                seqid, coords = toks[:2]
                if len(toks) == 3:
                    strand = toks[2]
                start, end = coords.split('-')

        elif isinstance(region, Feature):
            # Only the coordinates are taken; the feature's strand is
            # deliberately ignored so it cannot clobber the `strand` kwarg.
            seqid = region.seqid
            start = region.start
            end = region.end

        # otherwise assume it's a tuple
        else:
            seqid, start, end = region[:3]

    # e.g.,
    #   completely_within=True..... start >= {start} AND end <= {end}
    #   completely_within=False.... start < {end} AND end > {start}
    if completely_within:
        start_op = '>='
        end_op = '<='
    else:
        start_op = '<'
        end_op = '>'
        # The overlap test compares each column against the *opposite*
        # boundary, so swap the boundaries here.
        end, start = start, end

    # `conditions` and `args` are built in lockstep so that each "?"
    # placeholder lines up with its value.
    args = []
    conditions = []
    if seqid is not None:
        conditions.append('seqid = ?')
        args.append(seqid)
    if start is not None:
        start = int(start)
        conditions.append('start %s ?' % start_op)
        args.append(start)
    if end is not None:
        end = int(end)
        conditions.append('end %s ?' % end_op)
        args.append(end)

    # Only use bins if we have defined boundaries and completely_within is
    # True. Otherwise you can't know how far away a feature stretches
    # (which means bins are not computable ahead of time)
    if (start is not None) and (end is not None) and completely_within:
        if start <= bins.MAX_CHROM_SIZE and end <= bins.MAX_CHROM_SIZE:
            _bins = list(bins.bins(start, end, one=False))
            # See issue #45: keep the number of bound parameters bounded.
            if _bins and len(_bins) < 900:
                conditions.append(
                    '( %s )' % ' or '.join(['bin = ?'] * len(_bins)))
                args += _bins

    # Add the featuretype clause
    if featuretype is not None:
        if isinstance(featuretype, six.string_types):
            featuretype = [featuretype]
        else:
            # Materialize once so a one-shot iterable is not exhausted
            # while building the placeholders.
            featuretype = list(featuretype)
        conditions.append(
            '(%s)' % ' or '.join(['featuretype = ?'] * len(featuretype)))
        args.extend(featuretype)

    if strand is not None:
        conditions.append('strand = ?')
        args.append(strand)

    # Emit WHERE only when there is something to filter on; with no
    # restrictions at all this degenerates to selecting every feature.
    query = constants._SELECT
    if conditions:
        query = ' '.join([query, 'WHERE', ' AND '.join(conditions)])

    c = self.conn.cursor()

    self._last_query = query
    self._last_args = args

    self._context = {
        'start': start,
        'end': end,
        'seqid': seqid,
        'region': region,
    }
    c.execute(query, tuple(args))
    for i in c:
        yield self._feature_returner(**i)
def _update_relations(self): if not self.infer_gene_extent: return # TODO: do any indexes speed this up? c = self.conn.cursor() c2 = self.conn.cursor() logger.info("Creating relations(parent) index") c.execute('DROP INDEX IF EXISTS relationsparent') c.execute('CREATE INDEX relationsparent ON relations (parent)') logger.info("Creating relations(child) index") c.execute('DROP INDEX IF EXISTS relationschild') c.execute('CREATE INDEX relationschild ON relations (child)') logger.info('Inferring gene and transcript extents, ' 'and writing to tempfile') tmp = tempfile.NamedTemporaryFile(delete=False).name tmp = '/tmp/gffutils' fout = open(tmp, 'w') self._tmpfile = tmp # This takes some explanation... # # First, the nested subquery gets the level-1 parents of # self.subfeature featuretypes. For an on-spec GTF file, # self.subfeature = "exon". So this subquery translates to getting the # distinct level-1 parents of exons -- which are transcripts. # # OK, so this first subquery is now a list of transcripts; call it # "firstlevel". # # Then join firstlevel on relations, but the trick is to now consider # each transcript a *child* -- so that relations.parent (on the first # line of the query) will be the first-level parent of the transcript # (the gene). # # # The result is something like: # # transcript1 gene1 # transcript2 gene1 # transcript3 gene2 # # Note that genes are repeated; below we need to ensure that only one # is added. To ensure this, the results are ordered by the gene ID. c.execute( ''' SELECT DISTINCT firstlevel.parent, relations.parent FROM ( SELECT DISTINCT parent FROM relations JOIN features ON features.id = relations.child WHERE features.featuretype = ? AND relations.level = 1 ) AS firstlevel JOIN relations ON firstlevel.parent = child WHERE relations.level = 1 ORDER BY relations.parent ''', (self.subfeature,)) # Now we iterate through those results (using a new cursor) to infer # the extent of transcripts and genes. 
last_gene_id = None n_features = 0 for transcript_id, gene_id in c: # transcript extent c2.execute( ''' SELECT MIN(start), MAX(end), strand, seqid FROM features JOIN relations ON features.id = relations.child WHERE parent = ? AND featuretype == ? ''', (transcript_id, self.subfeature)) transcript_start, transcript_end, strand, seqid = c2.fetchone() transcript_attributes = { self.transcript_key: [transcript_id], self.gene_key: [gene_id] } transcript_bin = bins.bins( transcript_start, transcript_end, one=True) # Write out to file; we'll be reading it back in shortly. Omit # score, frame, source, and extra since they will always have the # same default values (".", ".", "gffutils_derived", and [] # respectively) fout.write('\t'.join(map(str, [ transcript_id, seqid, transcript_start, transcript_end, strand, 'transcript', transcript_bin, helpers._jsonify(transcript_attributes) ])) + '\n') n_features += 1 # Infer gene extent, but only if we haven't done so already. if gene_id != last_gene_id: c2.execute( ''' SELECT MIN(start), MAX(end), strand, seqid FROM features JOIN relations ON features.id = relations.child WHERE parent = ? AND featuretype == ? ''', (gene_id, self.subfeature)) gene_start, gene_end, strand, seqid = c2.fetchone() gene_attributes = {self.gene_key: [gene_id]} gene_bin = bins.bins(gene_start, gene_end, one=True) fout.write('\t'.join(map(str, [ gene_id, seqid, gene_start, gene_end, strand, 'gene', gene_bin, helpers._jsonify(gene_attributes) ])) + '\n') last_gene_id = gene_id n_features += 1 fout.close() def derived_feature_generator(): """ Generator of items from the file that was just created... """ keys = ['parent', 'seqid', 'start', 'end', 'strand', 'featuretype', 'bin', 'attributes'] for line in open(fout.name): d = dict(list(zip(keys, line.strip().split('\t')))) d.pop('parent') d['score'] = '.' d['source'] = 'gffutils_derived' d['frame'] = '.' 
d['extra'] = [] d['attributes'] = helpers._unjsonify(d['attributes']) f = feature.Feature(**d) f.id = self._id_handler(f) yield f # Drop the indexes so the inserts are faster c.execute('DROP INDEX IF EXISTS relationsparent') c.execute('DROP INDEX IF EXISTS relationschild') # Insert the just-inferred transcripts and genes. TODO: should we # *always* use "merge" here for the merge_strategy? logger.info("Importing inferred features into db") last_perc = None for i, f in enumerate(derived_feature_generator()): perc = int(i / float(n_features) * 100) if perc != last_perc: sys.stderr.write('%s of %s (%s%%)\r' % (i, n_features, perc)) sys.stderr.flush() last_perc = perc try: self._insert(f, c) except sqlite3.IntegrityError: fixed, final_strategy = self._do_merge(f, 'merge') c.execute( ''' UPDATE features SET attributes = ? WHERE id = ? ''', (helpers._jsonify(fixed.attributes), fixed.id)) logger.info("Committing changes") self.conn.commit() os.unlink(fout.name)
def make_query(args, other=None, limit=None, strand=None, featuretype=None,
               extra=None, order_by=None, reverse=False,
               completely_within=False):
    """
    Multi-purpose, bare-bones ORM function.

    This function composes queries given some commonly-used kwargs that can
    be passed to FeatureDB methods (like .parents(), .children(),
    .all_features(), .features_of_type()).  It handles, in one place,
    things like restricting to featuretype, limiting to a genomic range,
    limiting to one strand, or returning results ordered by different
    criteria.

    Additional filtering/subsetting/sorting behavior should be added here.

    (Note: this ended up having better performance (and flexibility) than
    sqlalchemy)

    This function also provides support for additional JOINs etc (supplied
    via the `other` kwarg) and extra conditional clauses (`extra` kwarg).
    See the `_QUERY` var below for the order in which they are used.

    For example, FeatureDB._relation uses `other` to supply the JOIN
    substatment, and that same method also uses `extra` to supply the
    "relations.level = ?" substatment (see the source for
    FeatureDB._relation for more details).

    `args` contains the arguments that will ultimately be supplied to the
    sqlite3.connection.execute function.  It may be further populated
    below -- for example, if strand="+", then the query will include
    a strand clause, and the strand will be appended to the args.

    `args` can be pre-filled with args that are passed to `other` and
    `extra`.

    Parameters
    ----------
    args : list
        Values for the "?" placeholders in `other` and `extra`.  Appended
        to in place and also returned.
    other, extra : string or None
        SQL fragments inserted verbatim (see `_QUERY`).
    limit : string or tuple
        "seqid:start-end" or (seqid, start, end).
    strand : string or None
        If given, adds "features.strand = ?".
    featuretype : string or iterable of strings
        Restrict to one featuretype or to any of several.
    order_by : string or iterable of strings
        An iterable is validated against the known keys (plus 'file_order'
        and 'length').  A single string is inserted as-is, except that
        'length' is translated to `(end - start)`.
    reverse : bool
        If True, "DESC" is appended to the ORDER BY clause, otherwise
        "ASC".  It is appended once, after the comma-joined columns.
    completely_within : bool
        Selects which comparison operators are used for `limit`.

    Raises
    ------
    ValueError
        If `len(args)` does not match the number of "?" in `other` plus
        `extra`, or if an item of an iterable `order_by` is not valid.

    Returns
    -------
    (query, args) tuple
    """
    _QUERY = ("{_SELECT} {OTHER} {EXTRA} {FEATURETYPE} "
              "{LIMIT} {STRAND} {ORDER_BY}")

    # Construct a dictionary `d` that will be used later as
    # _QUERY.format(**d).  Default is just _SELECT, which returns all
    # records in the features table.  (Recall that constants._SELECT gets
    # the fields in the order needed to reconstruct a Feature)
    d = dict(_SELECT=constants._SELECT, OTHER="", FEATURETYPE="", LIMIT="",
             STRAND="", ORDER_BY="", EXTRA="")

    if other:
        d['OTHER'] = other
    if extra:
        d['EXTRA'] = extra

    # If `other` and `extra` take args (that is, they have "?" in them),
    # then they should have been provided in `args`.
    required_args = (d['EXTRA'] + d['OTHER']).count('?')
    if len(args) != required_args:
        # Wrap in a 1-tuple so the message formats whatever `args` is.
        raise ValueError('Not enough args (%s) for subquery' % (args,))

    # Below, if a kwarg is specified, then we create sections of the query
    # -- appending to args as necessary.
    #
    # IMPORTANT: the order in which things are processed here is the same
    # as the order of the placeholders in _QUERY.  That is, we need to
    # build the args in parallel with the query to avoid putting the wrong
    # args in the wrong place.

    if featuretype:
        # Handle single or iterables of featuretypes.
        #
        # e.g., "featuretype = 'exon'"
        #
        # or, "featuretype IN ('exon', 'CDS')"
        if isinstance(featuretype, six.string_types):
            d['FEATURETYPE'] = "features.featuretype = ?"
            args.append(featuretype)
        else:
            # Materialize once: a one-shot iterable would otherwise be
            # used up by the placeholder count and add nothing to `args`.
            featuretype = list(featuretype)
            d['FEATURETYPE'] = (
                "features.featuretype IN (%s)"
                % (','.join(['?'] * len(featuretype))))
            args.extend(featuretype)

    if limit:
        # Restrict to a genomic region.  Makes use of the UCSC binning
        # strategy for performance.
        #
        # `limit` is a string or a tuple of (chrom, start, stop)
        #
        # e.g., "seqid = 'chr2L' AND start > 1000 AND end < 5000"
        if isinstance(limit, six.string_types):
            seqid, startstop = limit.split(':')
            start, end = startstop.split('-')
        else:
            seqid, start, end = limit

        # Identify possible bins
        _bins = bins.bins(int(start), int(end), one=False)

        # Use different overlap conditions
        if completely_within:
            d['LIMIT'] = ("features.seqid = ? AND features.start >= ? "
                          "AND features.end <= ?")
            args.extend([seqid, start, end])
        else:
            d['LIMIT'] = ("features.seqid = ? AND features.start <= ? "
                          "AND features.end >= ?")
            # Note order (end, start)
            args.extend([seqid, end, start])

        # Add bin clause. See issue #45.
        if len(_bins) < 900:
            d['LIMIT'] += " AND features.bin IN (%s)" % (','.join(
                map(str, _bins)))

    if strand:
        # e.g., "strand = '+'"
        d['STRAND'] = "features.strand = ?"
        args.append(strand)

    # TODO: implement file_order!
    valid_order_by = constants._gffkeys_extra + ['file_order', 'length']
    _order_by = []
    if order_by:
        # Default is essentially random order.
        #
        # e.g. "ORDER BY seqid, start DESC"
        if isinstance(order_by, six.string_types):
            # A bare string is passed through unvalidated, as before, but
            # 'length' gets the same translation as in the iterable case.
            if order_by == 'length':
                order_by = '(end - start)'
            _order_by.append(order_by)
        else:
            for k in order_by:
                if k not in valid_order_by:
                    raise ValueError("%s not a valid order-by value in %s"
                                     % (k, valid_order_by))

                # There's no length field, so order by end - start
                if k == 'length':
                    k = '(end - start)'

                _order_by.append(k)

        _order_by = ','.join(_order_by)
        if reverse:
            direction = 'DESC'
        else:
            direction = 'ASC'
        d['ORDER_BY'] = 'ORDER BY %s %s' % (_order_by, direction)

    # Ensure only one "WHERE" is included; the rest get "AND ".  This is
    # ugly.
    where = False
    if "where" in d['OTHER'].lower():
        where = True
    for i in ['EXTRA', 'FEATURETYPE', 'LIMIT', 'STRAND']:
        if d[i]:
            if not where:
                d[i] = "WHERE " + d[i]
                where = True
            else:
                d[i] = "AND " + d[i]

    return _QUERY.format(**d), args
def region(self, region, featuretype=None, completely_within=False):
    """
    Return features with any part overlapping `region`.

    Parameters
    ----------
    region : string, tuple, or Feature instance
        If string, then of the form "seqid:start-end"; an optional third
        ":"-separated token is used as the strand.  If tuple, then
        (seqid, start, end), optionally with the strand as a 4th item.  If
        :class:`Feature`, then use the feature's seqid, start, end and
        strand values.

    featuretype : None, string, or iterable
        If not None, then restrict output.  If string, then only report
        that feature type.  If iterable, then report all featuretypes in
        the iterable.

    completely_within : bool
        If False (default), returns features that overlap `region`, even
        partially.  If True, only return features that are completely
        within `region`.

    Returns
    -------
    A generator that yields whatever `self._feature_returner` builds from
    each matching row.
    """
    strand = None
    if isinstance(region, six.string_types):
        toks = region.split(':')
        seqid, coords = toks[:2]
        if len(toks) == 3:
            strand = toks[2]
        start, end = coords.split('-')

    elif isinstance(region, Feature):
        seqid = region.seqid
        start = region.start
        end = region.end
        strand = region.strand

    else:
        seqid, start, end = region[:3]
        if len(region) == 4:
            strand = region[3]

    # Normalize once so the same integers are used for the bins and for
    # the bound query parameters.
    start = int(start)
    end = int(end)

    # Get a list of all possible bins for this region
    _bins = list(bins.bins(start, end, one=False))

    if completely_within:
        position_clause = 'start >= ? AND end <= ?'
        args = [seqid, start, end]
    else:
        position_clause = 'start < ? AND end > ?'
        # note start/end swap
        args = [seqid, end, start]

    query = ' '.join([
        constants._SELECT,
        'WHERE seqid = ? AND', position_clause])

    # The bin clause only narrows the search; the position clause alone
    # already defines the result.  Skip it when it would need too many
    # bound parameters (same 900 cut-off as the sibling `region`, which
    # references issue #45).
    if _bins and len(_bins) < 900:
        query += ' AND ( %s )' % ' or '.join(['bin = ?'] * len(_bins))
        args += _bins

    # Add the featuretype clause
    if featuretype is not None:
        if isinstance(featuretype, six.string_types):
            featuretype = [featuretype]
        else:
            # Materialize so a one-shot iterable is not used up by the
            # placeholder count.
            featuretype = list(featuretype)
        feature_clause = ' or '.join(
            ['featuretype = ?' for _ in featuretype])
        query += ' AND (%s) ' % feature_clause
        args.extend(featuretype)

    if strand is not None:
        strand_clause = ' and strand = ? '
        query += strand_clause
        args.append(strand)

    c = self.conn.cursor()
    c.execute(query, tuple(args))
    for i in c:
        yield self._feature_returner(**i)
def interfeatures(self, features, new_featuretype=None,
                  merge_attributes=True, dialect=None):
    """
    Construct new features representing the space between features.

    For example, if `features` is a list of exons, then this method will
    return the introns.  If `features` is a list of genes, then this method
    will return the intergenic regions.

    Providing N features will return N - 1 new features.

    This method purposefully does *not* do any merging or sorting of
    coordinates, so you may want to use :meth:`FeatureDB.merge` first.

    Parameters
    ----------
    features : iterable of :class:`feature.Feature` instances
        Sorted, merged iterable.  All features must share one strand and
        one chromosome; this is checked with `assert` statements (which
        Python skips when run with -O).

    new_featuretype : string or None
        The new features will all be of this type, or, if None (default)
        then the featuretype of each new feature is constructed from its two
        neighboring features, e.g., `inter_exon_exon`.

    merge_attributes : bool
        If True (default), the new features' attributes will be a merge
        (via `helpers.merge_attributes`) of the neighboring features'
        attributes.  This is useful if you have provided a list of exons;
        the introns will then retain the transcript and/or gene parents.
        If False, the new features get an empty attributes dict.

    dialect : dict or None
        Accepted for API compatibility.
        NOTE(review): this implementation does not forward it to
        `self._feature_returner` -- confirm whether it should.

    Returns
    -------
    A generator yielding whatever `self._feature_returner` builds for each
    gap.
    """
    last_feature = None
    for f in features:
        # no inter-feature for the first one
        if last_feature is None:
            last_feature = f
            continue

        # Resolve the type per pair; do not overwrite the argument, or the
        # first pair's auto-generated name would stick for all later pairs.
        if new_featuretype is None:
            featuretype = 'inter_%s_%s' % (
                last_feature.featuretype, f.featuretype)
        else:
            featuretype = new_featuretype

        assert last_feature.strand == f.strand, \
            "features must all be on the same strand"
        assert last_feature.chrom == f.chrom, \
            "features must all be on the same chromosome"

        # Shrink: the gap excludes the flanking features' own end bases.
        interfeature_start = last_feature.stop + 1
        interfeature_stop = f.start - 1

        # Honor the flag instead of merging unconditionally.
        if merge_attributes:
            new_attributes = helpers.merge_attributes(
                last_feature.attributes, f.attributes)
        else:
            new_attributes = {}

        new_bin = bins.bins(
            interfeature_start, interfeature_stop, one=True)
        fields = dict(
            seqid=last_feature.chrom,
            source='gffutils_derived',
            featuretype=featuretype,
            start=interfeature_start,
            end=interfeature_stop,
            score='.',
            strand=last_feature.strand,
            frame='.',
            attributes=new_attributes,
            bin=new_bin)

        yield self._feature_returner(**fields)

        # Advance so the next gap is merged with its actual neighbor.
        last_feature = f
def make_query(args, other=None, limit=None, strand=None, featuretype=None,
               extra=None, order_by=None, reverse=False,
               completely_within=False):
    """
    Compose a SELECT query on the features table, plus its arguments.

    This function composes queries given some commonly-used kwargs that can
    be passed to FeatureDB methods (like .parents(), .children(),
    .all_features(), .features_of_type()).  It handles, in one place,
    things like restricting to featuretype, limiting to a genomic range,
    limiting to one strand, or returning results ordered by different
    criteria.

    Additional filtering/subsetting/sorting behavior should be added here.

    (Note: this ended up having better performance (and flexibility) than
    sqlalchemy)

    This function also provides support for additional JOINs etc (supplied
    via the `other` kwarg) and extra conditional clauses (`extra` kwarg).
    See the `_QUERY` var below for the order in which they are used.

    For example, FeatureDB._relation uses `other` to supply the JOIN
    substatment, and that same method also uses `extra` to supply the
    "relations.level = ?" substatment (see the source for
    FeatureDB._relation for more details).

    `args` contains the arguments that will ultimately be supplied to the
    sqlite3.connection.execute function.  It may be further populated
    below -- for example, if strand="+", then the query will include
    a strand clause, and the strand will be appended to the args.

    `args` can be pre-filled with args that are passed to `other` and
    `extra`.

    Parameters
    ----------
    args : list
        Values for the "?" placeholders in `other` and `extra`.  Appended
        to in place and also returned.
    other, extra : string or None
        SQL fragments inserted verbatim (see `_QUERY`).
    limit : string or tuple
        "seqid:start-end" or (seqid, start, end).
    strand : string or None
        If given, adds "features.strand = ?".
    featuretype : string or iterable of strings
        Restrict to one featuretype or to any of several.
    order_by : string or iterable of strings
        An iterable is validated against the known keys (plus 'file_order'
        and 'length').  A single string is inserted as-is, except that
        'length' is translated to `(end - start)`.
    reverse : bool
        If True, "DESC" is appended to the ORDER BY clause, otherwise
        "ASC".  It is appended once, after the comma-joined columns.
    completely_within : bool
        Selects which comparison operators are used for `limit`.

    Raises
    ------
    ValueError
        If `len(args)` does not match the number of "?" in `other` plus
        `extra`, or if an item of an iterable `order_by` is not valid.

    Returns
    -------
    (query, args) tuple
    """
    _QUERY = ("{_SELECT} {OTHER} {EXTRA} {FEATURETYPE} "
              "{LIMIT} {STRAND} {ORDER_BY}")

    # Construct a dictionary `d` that will be used later as
    # _QUERY.format(**d).  Default is just _SELECT, which returns all
    # records in the features table.  (Recall that constants._SELECT gets
    # the fields in the order needed to reconstruct a Feature)
    d = dict(_SELECT=constants._SELECT, OTHER="", FEATURETYPE="", LIMIT="",
             STRAND="", ORDER_BY="", EXTRA="")

    if other:
        d['OTHER'] = other
    if extra:
        d['EXTRA'] = extra

    # If `other` and `extra` take args (that is, they have "?" in them),
    # then they should have been provided in `args`.
    required_args = (d['EXTRA'] + d['OTHER']).count('?')
    if len(args) != required_args:
        # Wrap in a 1-tuple so the message formats whatever `args` is.
        raise ValueError('Not enough args (%s) for subquery' % (args,))

    # Below, if a kwarg is specified, then we create sections of the query
    # -- appending to args as necessary.
    #
    # IMPORTANT: the order in which things are processed here is the same
    # as the order of the placeholders in _QUERY.  That is, we need to
    # build the args in parallel with the query to avoid putting the wrong
    # args in the wrong place.

    if featuretype:
        # Handle single or iterables of featuretypes.
        #
        # e.g., "featuretype = 'exon'"
        #
        # or, "featuretype IN ('exon', 'CDS')"
        if isinstance(featuretype, six.string_types):
            d['FEATURETYPE'] = "features.featuretype = ?"
            args.append(featuretype)
        else:
            # Materialize once: a one-shot iterable would otherwise be
            # used up by the placeholder count and add nothing to `args`.
            featuretype = list(featuretype)
            d['FEATURETYPE'] = (
                "features.featuretype IN (%s)"
                % (','.join(['?'] * len(featuretype)))
            )
            args.extend(featuretype)

    if limit:
        # Restrict to a genomic region.  Makes use of the UCSC binning
        # strategy for performance.
        #
        # `limit` is a string or a tuple of (chrom, start, stop)
        #
        # e.g., "seqid = 'chr2L' AND start > 1000 AND end < 5000"
        if isinstance(limit, six.string_types):
            seqid, startstop = limit.split(':')
            start, end = startstop.split('-')
        else:
            seqid, start, end = limit

        # Identify possible bins (as a list so they can be counted)
        _bins = list(bins.bins(int(start), int(end), one=False))

        # Use different overlap conditions
        if completely_within:
            d['LIMIT'] = (
                "features.seqid = ? AND features.start >= ? "
                "AND features.end <= ?"
            )
            args.extend([seqid, start, end])
        else:
            d['LIMIT'] = (
                "features.seqid = ? AND features.start <= ? "
                "AND features.end >= ?"
            )
            # Note order (end, start)
            args.extend([seqid, end, start])

        # Add bin clause, but only while the list stays short; the
        # coordinate conditions above already define the result, so very
        # large regions simply skip it (same 900 cut-off as the sibling
        # make_query, which references issue #45).
        if len(_bins) < 900:
            d['LIMIT'] += " AND features.bin IN (%s)" % (
                ','.join(map(str, _bins)))

    if strand:
        # e.g., "strand = '+'"
        d['STRAND'] = "features.strand = ?"
        args.append(strand)

    # TODO: implement file_order!
    valid_order_by = constants._gffkeys_extra + ['file_order', 'length']
    _order_by = []
    if order_by:
        # Default is essentially random order.
        #
        # e.g. "ORDER BY seqid, start DESC"
        if isinstance(order_by, six.string_types):
            # A bare string is passed through unvalidated, as before, but
            # 'length' gets the same translation as in the iterable case.
            if order_by == 'length':
                order_by = '(end - start)'
            _order_by.append(order_by)
        else:
            for k in order_by:
                if k not in valid_order_by:
                    raise ValueError("%s not a valid order-by value in %s"
                                     % (k, valid_order_by))

                # There's no length field, so order by end - start
                if k == 'length':
                    k = '(end - start)'

                _order_by.append(k)

        _order_by = ','.join(_order_by)
        if reverse:
            direction = 'DESC'
        else:
            direction = 'ASC'
        d['ORDER_BY'] = 'ORDER BY %s %s' % (_order_by, direction)

    # Ensure only one "WHERE" is included; the rest get "AND ".  This is
    # ugly.
    where = False
    if "where" in d['OTHER'].lower():
        where = True
    for i in ['EXTRA', 'FEATURETYPE', 'LIMIT', 'STRAND']:
        if d[i]:
            if not where:
                d[i] = "WHERE " + d[i]
                where = True
            else:
                d[i] = "AND " + d[i]

    return _QUERY.format(**d), args
def __init__(self, seqid=".", source=".", featuretype=".",
             start=".", end=".", score=".", strand=".", frame=".",
             attributes=None, extra=None, bin=None, id=None, dialect=None,
             file_order=None, keep_order=False,
             sort_attribute_values=False):
    """
    Represents a feature from the database.

    When printed, reproduces the original line from the file as faithfully
    as possible using `dialect`.

    Usually you won't want to use this directly, since it has various
    implementation details needed for operating in the context of FeatureDB
    objects.  Instead, try the :func:`feature_from_line` function.

    Parameters
    ----------

    seqid : string
        Name of the sequence (often chromosome)

    source : string
        Source of the feature; typically the originating database or
        program that predicted the feature

    featuretype : string
        Type of feature.  For example "gene", "exon", "TSS", etc

    start, end : int or "."
        1-based coordinates; start must be <= end.  If "." (the default
        placeholder for GFF files), then the corresponding attribute will
        be None.

    score : string
        Stored as a string.

    strand : "+" | "-" | "."
        Strand of the feature; "." when strand is not relevant.

    frame : "0" | "1" | "2"
        Coding frame.  0 means in-frame; 1 means there is one extra base at
        the beginning, so the first codon starts at the second base;
        2 means two extra bases at the beginning.  Interpretation is strand
        specific; "beginning" for a minus-strand feature is at the end
        coordinate.

    attributes : string or dict
        If a string, first assume it is serialized JSON; if this fails then
        assume it's the original key/vals string.  If it's a dictionary
        already, then use as-is.

        The end result is that this instance's `attributes` attribute will
        always be a dictionary.

        Upon printing, the attributes will be reconstructed based on this
        dictionary and the dialect -- except if the original attributes
        string was provided, in which case that will be used directly.

    extra : string or list
        Additional fields after the canonical 9 fields for GFF/GTF.

        If a string, then first assume it's serialized JSON; if this fails
        then assume it's a tab-delimited string of additional fields.  If
        it's a list already, then use as-is.

    bin : int
        UCSC genomic bin. If None, will be created based on provided
        start/end; if start or end is "." then bin will be None.

    id : None or string
        Database-specific primary key for this feature.  The only time this
        should not be None is if this feature is coming from a database, in
        which case it will be filled in automatically.

    dialect : dict or None
        The dialect to use when reconstructing attribute strings; defaults
        to the GFF3 spec.  :class:`FeatureDB` objects will automatically
        attach the dialect from the original file.

    file_order : int
        This is the `rowid` special field used in a sqlite3 database; this
        is provided by FeatureDB.

    keep_order : bool
        If True, then the attributes in the printed string will be in the
        order specified in the dialect.  Disabled by default, since this
        sorting step is time-consuming over many features.

    sort_attribute_values : bool
        If True, then the values of each attribute will be sorted when the
        feature is printed.  Mostly useful for testing, where the order is
        important for checking against expected values. Disabled by
        default, since it can be time-consuming over many features.
    """
    # Coordinates arrive as int-like, ".", or None.  "." and None both
    # end up as None; anything else is cast to int.
    if start is not None:
        start = None if start == "." else int(start)
    if end is not None:
        end = None if end == "." else int(end)

    # Attributes: a falsy value becomes a fresh dict_class(); a string is
    # tried as JSON first and, failing that, parsed as a raw key/vals
    # string; anything else is kept as given.
    #
    # dict_class is set at the module level...this is so you can swap in
    # and out different dict implementations (ordered, defaultdict, etc)
    # for testing.
    if not attributes:
        attributes = dict_class()
    if isinstance(attributes, six.string_types):
        try:
            attributes = helpers._unjsonify(attributes, isattributes=True)
        except simplejson.JSONDecodeError:
            # Not JSON: treat it as the original attributes string.  The
            # parser also reports the dialect it detected, which is only
            # used if the caller did not supply one.
            attributes, detected_dialect = parser._split_keyvals(attributes)
            if not dialect:
                dialect = detected_dialect

    # Extra fields: a falsy value becomes []; a string is tried as JSON
    # first and otherwise split on tabs.
    if not extra:
        extra = []
    if isinstance(extra, six.string_types):
        try:
            extra = helpers._unjsonify(extra)
        except simplejson.JSONDecodeError:
            extra = extra.split('\t')

    # Calculate bin if not provided.  On TypeError the bin simply stays
    # None.
    if bin is None:
        try:
            bin = bins.bins(start, end, one=True)
        except TypeError:
            pass

    self.seqid = seqid
    self.source = source
    self.featuretype = featuretype
    self.start = start
    self.end = end
    self.score = score
    self.strand = strand
    self.frame = frame
    self.attributes = attributes
    self.extra = extra
    self.bin = bin
    self.id = id
    # Fall back to the module-wide default dialect when none was given or
    # detected.
    self.dialect = dialect or constants.dialect
    self.file_order = file_order
    self.keep_order = keep_order
    self.sort_attribute_values = sort_attribute_values