Example #1
0
def aband(x,y,z,antal,frekmin,frekmax,btype,fmin,fmax=None):
    """
    Filter the samples of ``y`` in place and return them as ``[y]``.

    The sine/cosine coefficients come from ``aft.aft`` and the index limits
    from ``bins.bins``; every result is scaled by the value returned from
    ``awindow.awindow``.

    ``btype`` selects the mode:

    * ``1`` (high-pass): the components with index ``0 .. d-1`` are
      subtracted from each sample, where ``[c, d] = bins.bins(frek, fmin)``.
    * ``2`` (low-pass): each sample is replaced by the sum of the components
      with index ``0 .. c-1``.
    * anything else (band-pass): each sample is replaced by the sum of the
      components with index ``c .. f-1``, where
      ``[e, f] = bins.bins(frek, fmax)``.

    Note that ``y`` itself is modified; the returned list wraps that same
    object.
    """
    frek, p, a, b = aft.aft(x, y, z, frekmin, frekmax, antal)
    c, d = bins.bins(frek, fmin)

    is_highpass = btype == 1
    is_lowpass = not is_highpass and btype == 2
    if not (is_highpass or is_lowpass):
        # Only the band-pass mode needs the upper frequency limit.
        e, f = bins.bins(frek, fmax)

    wp = awindow.awindow(x, z, frek)

    def component(k, t):
        # k-th sine/cosine pair evaluated at position t.
        phase = 2*pi*frek[k]*t
        return a[k]*sin(phase)+b[k]*cos(phase)

    if is_highpass:
        for j in range(len(y)):
            for i in range(d):
                y[j] = y[j] - component(i, x[j])/(wp)
        return [y]

    for j in range(len(y)):
        u = 0
        for i in (range(c) if is_lowpass else range(c, f)):
            u += component(i, x[j])
        # The two modes keep their original, slightly different division.
        y[j] = u/(wp) if is_lowpass else u/float(wp)
    return [y]
Example #2
0
def binrange(data_list,
             enough=None,
             cohen=0.2,
             maxBins=16,
             minBin=4,
             trivial=1.05):
    """
    Return the boundary values of the ranges computed by ``bins.bins``.

    All keyword arguments are forwarded unchanged to ``bins.bins``;
    ``data_list`` is passed as its ``t`` argument.

    :param data_list: data handed to ``bins.bins`` as ``t``
    :param enough: forwarded to ``bins.bins``
    :param cohen: forwarded to ``bins.bins``
    :param maxBins: forwarded to ``bins.bins``
    :param minBin: forwarded to ``bins.bins``
    :param trivial: forwarded to ``bins.bins``
    :return: list of bin edges: the ``lo`` of the first range followed by
        the ``up`` of every range, e.g. {a,b,c,d,e} for the bins
        [a,b] (b,c] (c,d] (d,e]
    """
    options = dict(enough=enough,
                   cohen=cohen,
                   maxBins=maxBins,
                   minBin=minBin,
                   trivial=trivial)
    ranges = bins.bins(t=data_list, **options)

    # Lower edge of the first range, then the upper edge of each range.
    edges = [ranges[0].lo]
    edges.extend(r.up for r in ranges)
    return edges
Example #3
0
def _bin_from_dict(d):
    """
    Given a dictionary yielded by the parser, return the genomic "UCSC" bin
    """
    try:
        start = int(d['start'])
        end = int(d['end'])
        return bins.bins(start, end, one=True)

    # e.g., if "."
    except ValueError:
        return None
Example #4
0
def make_query(args, other=None, limit=None, strand=None, featuretype=None,
               extra=None, order_by=None, reverse=False,
               completely_within=False):
    """
    Compose a SQL query string plus the argument list to execute it with.

    This is the single place where the commonly-used kwargs of FeatureDB
    methods (like .parents(), .children(), .all_features(),
    .features_of_type()) are turned into SQL: restricting to a featuretype,
    limiting to a genomic range, limiting to one strand, and ordering the
    results.  Additional filtering/subsetting/sorting behavior should be
    added here.

    (Note: this ended up having better performance (and flexibility) than
    sqlalchemy)

    `other` supplies additional JOINs etc and `extra` supplies extra
    conditional clauses; see the `template` variable below for the order in
    which the pieces are used.  For example, FeatureDB._relation uses `other`
    to supply the JOIN substatement and `extra` to supply the
    "relations.level = ?" substatement.

    `args` holds the arguments that will ultimately be given to the
    sqlite3 execute call.  It must already contain exactly one value per "?"
    found in `other` and `extra` (otherwise ValueError is raised), and it is
    extended in place as further clauses are added -- for example, if
    strand="+", the query gains a strand clause and "+" is appended to
    `args`.

    Returns a tuple of (query string, args); the returned `args` is the same
    list object that was passed in.
    """
    template = ("{_SELECT} {OTHER} {EXTRA} {FEATURETYPE} "
                "{LIMIT} {STRAND} {ORDER_BY}")

    # Every placeholder of `template` gets an entry; the default (all empty
    # except _SELECT) returns all records in the features table.
    pieces = {
        '_SELECT': constants._SELECT,
        'OTHER': other if other else "",
        'EXTRA': extra if extra else "",
        'FEATURETYPE': "",
        'LIMIT': "",
        'STRAND': "",
        'ORDER_BY': "",
    }

    # Any "?" in `other`/`extra` must already be covered by `args`.
    placeholders = (pieces['EXTRA'] + pieces['OTHER']).count('?')
    if placeholders != len(args):
        raise ValueError('Not enough args (%s) for subquery' % args)

    # IMPORTANT: the sections below are handled in the same order as the
    # placeholders appear in `template`, so that `args` is built up in
    # parallel with the query text.

    if featuretype:
        if isinstance(featuretype, basestring):
            # e.g., "featuretype = 'exon'"
            pieces['FEATURETYPE'] = "features.featuretype = ?"
            args.append(featuretype)
        else:
            # e.g., "featuretype IN ('exon', 'CDS')"
            marks = ','.join("?" for _ in featuretype)
            pieces['FEATURETYPE'] = "features.featuretype IN  (%s)" % marks
            args.extend(featuretype)

    if limit:
        # `limit` is either "seqid:start-end" or a (seqid, start, end) tuple.
        if isinstance(limit, basestring):
            seqid, startstop = limit.split(':')
            start, end = startstop.split('-')
        else:
            seqid, start, end = limit

        # Bins that could hold features in this range (UCSC binning strategy,
        # used for performance).
        candidate_bins = bins.bins(int(start), int(end), one=False)

        if completely_within:
            clause = ("features.seqid = ? AND features.start >= ? "
                      "AND features.end <= ?")
            bounds = [seqid, start, end]
        else:
            clause = ("features.seqid = ? AND features.start <= ? "
                      "AND features.end >= ?")
            # Note order (end, start)
            bounds = [seqid, end, start]
        args.extend(bounds)

        pieces['LIMIT'] = clause + " AND features.bin IN (%s)" % (
            ','.join(map(str, candidate_bins)))

    if strand:
        # e.g., "strand = '+'"
        pieces['STRAND'] = "features.strand = ?"
        args.append(strand)

    # TODO: implement file_order!
    valid_order_by = constants._gffkeys_extra + ['file_order', 'length']
    if order_by:
        # Without an ORDER BY the order is essentially random.
        #
        # e.g. "ORDER BY seqid, start DESC"
        if isinstance(order_by, basestring):
            # A plain string is used verbatim.
            columns = [order_by]
        else:
            columns = []
            for key in order_by:
                if key not in valid_order_by:
                    raise ValueError("%s not a valid order-by value in %s"
                                     % (key, valid_order_by))
                # There's no length field, so order by end - start
                columns.append('(end - start)' if key == 'length' else key)

        direction = 'DESC' if reverse else 'ASC'
        pieces['ORDER_BY'] = 'ORDER BY %s %s' % (','.join(columns), direction)

    # Only the first condition gets "WHERE" (unless `other` already has
    # one); every later condition is chained with "AND ".
    have_where = "where" in pieces['OTHER'].lower()
    for name in ('EXTRA', 'FEATURETYPE', 'LIMIT', 'STRAND'):
        if not pieces[name]:
            continue
        prefix = "AND " if have_where else "WHERE "
        pieces[name] = prefix + pieces[name]
        have_where = True

    return template.format(**pieces), args
Example #5
0
    def __init__(self, seqid=".", source=".", featuretype=".",
                 start=".", end=".", score=".", strand=".", frame=".",
                 attributes=None, extra=None, bin=None, id=None, dialect=None,
                 file_order=None, keep_order=False):
        """
        Represents a feature from the database.

        When printed, reproduces the original line from the file as faithfully
        as possible using `dialect`.

        Usually you won't want to use this directly, since it has various
        implementation details needed for operating in the context of FeatureDB
        objects.  Instead, try the :func:`feature_from_line` function.

        Parameters
        ----------

        seqid : string
            Name of the sequence (often chromosome)

        source : string
            Source of the feature; typically the originating database or
            program that predicted the feature

        featuretype : string
            Type of feature.  For example "gene", "exon", "TSS", etc

        start, end : int-like, "." or None
            1-based coordinates; start must be <= end.  "." (the default
            placeholder for GFF files) is stored as None, None is kept as
            None, and anything else is converted with int().

        score : string
            Stored as a string.

        strand : "+" | "-" | "."
            Strand of the feature; "." when strand is not relevant.

        frame : "0" | "1" | "2"
            Coding frame.  0 means in-frame; 1 means there is one extra base at
            the beginning, so the first codon starts at the second base;
            2 means two extra bases at the beginning.  Interpretation is strand
            specific; "beginning" for a minus-strand feature is at the end
            coordinate.

        attributes : string or dict
            A string is first tried as serialized JSON; if that fails it is
            treated as the original key/vals string and split into a dict
            (the dialect detected while splitting is used when no `dialect`
            was given).  A dictionary is used as-is.  An empty or missing
            value becomes a fresh `dict_class()` instance.

        extra : string or list
            Additional fields after the canonical 9 fields for GFF/GTF.  A
            string is first tried as serialized JSON; if that fails it is
            split on tabs.  A list is used as-is; an empty or missing value
            becomes an empty list.

        bin : int
            UCSC genomic bin.  If None, it is computed from start/end; if
            that computation raises TypeError (start or end is None) the bin
            stays None.

        id : None or string
            Database-specific primary key for this feature.  The only time this
            should not be None is if this feature is coming from a database, in
            which case it will be filled in automatically.

        dialect : dict or None
            The dialect to use when reconstructing attribute strings; falls
            back to `constants.dialect` when nothing else is available.

        file_order : int
            This is the `rowid` special field used in a sqlite3 database; this
            is provided by FeatureDB.

        keep_order : bool
            If True, then the attributes in the printed string will be in the
            order specified in the dialect.  Disabled by default, since this
            sorting step is time-consuming over many features.

        """
        def _coordinate(value):
            # "." and None both mean "no coordinate"; otherwise force an int.
            if value == ".":
                return None
            if value is None:
                return None
            return int(value)

        start = _coordinate(start)
        end = _coordinate(end)

        # dict_class is set at the module level so that different dict
        # implementations (ordered, defaultdict, etc) can be swapped in for
        # testing.
        if not attributes:
            attributes = dict_class()

        if isinstance(attributes, basestring):
            try:
                attributes = helpers._unjsonify(attributes, isattributes=True)
            except simplejson.JSONDecodeError:
                # Not JSON, so it must be the original attributes string;
                # split it so that Feature.attributes is still a dict.
                attributes, detected_dialect = parser._split_keyvals(
                    attributes)

                # An explicitly provided dialect wins over the detected one.
                if not dialect:
                    dialect = detected_dialect

        # Same idea for `extra`: JSON first, tab-delimited text second.
        if not extra:
            extra = []
        if isinstance(extra, basestring):
            try:
                extra = helpers._unjsonify(extra)
            except simplejson.JSONDecodeError:
                extra = extra.split('\t')

        # Derive the bin from the coordinates unless one was supplied.
        if bin is None:
            try:
                bin = bins.bins(start, end, one=True)
            except TypeError:
                bin = None

        self.seqid = seqid
        self.source = source
        self.featuretype = featuretype
        self.start = start
        self.end = end
        self.score = score
        self.strand = strand
        self.frame = frame
        self.attributes = attributes
        self.extra = extra
        self.bin = bin
        self.id = id
        self.dialect = dialect if dialect else constants.dialect
        self.file_order = file_order
        self.keep_order = keep_order
Example #6
0
    def interfeatures(self, features, new_featuretype=None,
                      merge_attributes=True, dialect=None):
        """
        Construct new features representing the space between features.

        For example, if `features` is a list of exons, then this method will
        return the introns.  If `features` is a list of genes, then this method
        will return the intergenic regions.

        Providing N features will return N - 1 new features.

        This method purposefully does *not* do any merging or sorting of
        coordinates, so you may want to use :meth:`FeatureDB.merge` first.

        Each new feature is built from the two features on either side of the
        gap: it starts one base after the previous feature's stop and ends
        one base before the next feature's start.  Both neighbors must have
        the same strand and the same chrom (checked with ``assert``).

        Parameters
        ----------
        features : iterable of :class:`feature.Feature` instances
            Sorted, merged iterable

        new_featuretype : string or None
            The new features will all be of this type, or, if None (default)
            then each featuretype is constructed from its two neighboring
            features, e.g., `inter_exon_exon`.

        merge_attributes : bool
            If True (default), the new feature's attributes are the merge of
            its two neighbors' attributes (via `helpers.merge_attributes`).
            This is useful if you have provided a list of exons; the introns
            will then retain the transcript and/or gene parents.  If False,
            the new feature gets empty attributes.

        dialect : dict or None
            Accepted for interface compatibility.  It is not forwarded by
            this method; the new features are created through
            `self._feature_returner`.
        """
        last_feature = None
        for f in features:
            # no inter-feature for the first one
            if last_feature is None:
                last_feature = f
                continue

            # Name the gap after *this* pair of neighbors; the caller's
            # `new_featuretype` is never overwritten, so later pairs are not
            # stuck with the name generated for the first pair.
            if new_featuretype is None:
                featuretype = 'inter_%s_%s' % (
                    last_feature.featuretype, f.featuretype)
            else:
                featuretype = new_featuretype

            assert last_feature.strand == f.strand
            assert last_feature.chrom == f.chrom

            # Shrink by one base on each side so the new feature does not
            # include the flanking features' own end coordinates.
            interfeature_start = last_feature.stop + 1
            interfeature_stop = f.start - 1

            if merge_attributes:
                new_attributes = helpers.merge_attributes(
                    last_feature.attributes, f.attributes)
            else:
                new_attributes = {}

            fields = dict(
                seqid=last_feature.chrom,
                source='gffutils_derived',
                featuretype=featuretype,
                start=interfeature_start,
                end=interfeature_stop,
                score='.',
                strand=last_feature.strand,
                frame='.',
                attributes=new_attributes,
                bin=bins.bins(
                    interfeature_start, interfeature_stop, one=True))

            yield self._feature_returner(**fields)

            # Advance, so the next gap is described by its real neighbors.
            last_feature = f
Example #7
0
    def region(self, region, featuretype=None, completely_within=False):
        """
        Return features with any part overlapping `region`.

        Coordinates are treated as inclusive on both ends, so a feature that
        shares only a single boundary base with `region` is reported.

        Parameters
        ----------
        region : string, tuple, or Feature instance
            If string, then of the form "seqid:start-end", optionally followed
            by ":strand".  If tuple, then (seqid, start, end), optionally with
            the strand as a fourth item.  If :class:`Feature`, then use the
            feature's seqid, start, end and strand values.

        featuretype : None, string, or iterable
            If not None, then restrict output.  If string, then only report
            that feature type.  If iterable, then report all featuretypes in
            the iterable.

        completely_within : bool
            If False (default), returns features that overlap `region`, even
            partially.  If True, only return features that are completely
            within `region`.

        Yields
        ------
        One object per matching database row, built by
        `self._feature_returner`.
        """
        strand = None
        if isinstance(region, basestring):
            toks = region.split(':')
            seqid, coords = toks[:2]
            if len(toks) == 3:
                strand = toks[2]
            start, end = coords.split('-')

        elif isinstance(region, Feature):
            seqid = region.seqid
            start = region.start
            end = region.end
            strand = region.strand
        else:
            seqid, start, end = region[:3]
            if len(region) == 4:
                strand = region[3]

        # Normalize once, so the bin lookup and the SQL parameters agree and
        # string coordinates are never bound as text.
        start = int(start)
        end = int(end)

        # Get a list of all possible bins for this region
        _bins = list(bins.bins(start, end, one=False))

        if completely_within:
            position_clause = 'start >= ? AND end <= ?'
            args = [seqid, start, end]
        else:
            # Inclusive comparison: a feature starting exactly at the region
            # end (or ending exactly at the region start) still overlaps.
            position_clause = 'start <= ? AND end >= ?'
            # note start/end swap
            args = [seqid, end, start]

        args += _bins
        bin_clause = 'bin IN (%s)' % ','.join('?' * len(_bins))

        query = ' '.join([
            constants._SELECT,
            'WHERE seqid = ? AND', position_clause,
            'AND', bin_clause])

        # Add the featuretype clause
        if featuretype is not None:
            if isinstance(featuretype, basestring):
                featuretype = [featuretype]
            else:
                # Materialize so one-shot iterables (generators) are usable
                # for both the placeholders and the parameters.
                featuretype = list(featuretype)
            query += ' AND featuretype IN (%s) ' % ','.join(
                '?' * len(featuretype))
            args.extend(featuretype)

        if strand is not None:
            query += ' and strand = ? '
            args.append(strand)

        c = self.conn.cursor()
        c.execute(query, tuple(args))
        for row in c:
            yield self._feature_returner(**row)
Example #8
0
    def _update_relations(self):
        """
        Infer transcript and gene extents and insert them as new features.

        Does nothing unless `self.infer_gene_extent` is set.  Otherwise:

        1. (Re)creates indexes on relations(parent) and relations(child).
        2. Finds every (transcript, gene) pair among the level-1 parents of
           `self.subfeature` features, computes the MIN(start)/MAX(end) extent
           of each transcript and each gene, and writes one tab-delimited
           line per inferred feature to a temporary file (whose path is also
           stored in `self._tmpfile`).
        3. Drops the indexes again, reads the file back, and inserts each
           feature; on `sqlite3.IntegrityError` the feature is merged with
           the existing one and that row's attributes are updated.
        4. Commits and deletes the temporary file.
        """
        if not self.infer_gene_extent:
            return

        # TODO: do any indexes speed this up?
        c = self.conn.cursor()
        c2 = self.conn.cursor()

        logger.info("Creating relations(parent) index")
        c.execute('DROP INDEX IF EXISTS relationsparent')
        c.execute('CREATE INDEX relationsparent ON relations (parent)')
        logger.info("Creating relations(child) index")
        c.execute('DROP INDEX IF EXISTS relationschild')
        c.execute('CREATE INDEX relationschild ON relations (child)')

        logger.info('Inferring gene and transcript extents, '
                    'and writing to tempfile')

        # Use the uniquely-named temporary file itself rather than a fixed,
        # shared path: a fixed path collides between concurrent runs and
        # leaves the temporary file that was created behind.
        fout = tempfile.NamedTemporaryFile(mode='w', delete=False)
        tmp = fout.name
        self._tmpfile = tmp

        def extent_line(parent_id, featuretype, attributes):
            # One output line for `parent_id`, spanning all of its
            # `self.subfeature` children.
            c2.execute(
                '''
                SELECT MIN(start), MAX(end), strand, seqid
                FROM features
                JOIN relations ON
                features.id = relations.child
                WHERE parent = ? AND featuretype == ?
                ''', (parent_id, self.subfeature))
            start, end, strand, seqid = c2.fetchone()

            # Omit score, frame, source, and extra since they will always
            # have the same default values (".", ".", "gffutils_derived", and
            # [] respectively)
            return '\t'.join(map(str, [
                parent_id,
                seqid,
                start,
                end,
                strand,
                featuretype,
                bins.bins(start, end, one=True),
                helpers._jsonify(attributes)
            ])) + '\n'

        # This takes some explanation...
        #
        # First, the nested subquery gets the level-1 parents of
        # self.subfeature featuretypes.  For an on-spec GTF file,
        # self.subfeature = "exon". So this subquery translates to getting the
        # distinct level-1 parents of exons -- which are transcripts.
        #
        # OK, so this first subquery is now a list of transcripts; call it
        # "firstlevel".
        #
        # Then join firstlevel on relations, but the trick is to now consider
        # each transcript a *child* -- so that relations.parent (on the first
        # line of the query) will be the first-level parent of the transcript
        # (the gene).
        #
        # The result is something like:
        #
        #   transcript1     gene1
        #   transcript2     gene1
        #   transcript3     gene2
        #
        # Note that genes are repeated; below we need to ensure that only one
        # is added.  To ensure this, the results are ordered by the gene ID.

        c.execute(
            '''
            SELECT DISTINCT firstlevel.parent, relations.parent
            FROM (
                SELECT DISTINCT parent
                FROM relations
                JOIN features ON features.id = relations.child
                WHERE features.featuretype = ?
                AND relations.level = 1
            )
            AS firstlevel
            JOIN relations ON firstlevel.parent = child
            WHERE relations.level = 1
            ORDER BY relations.parent
            ''', (self.subfeature,))

        # Now we iterate through those results (using a second cursor for the
        # extent queries) and write them out; we'll be reading the file back
        # in shortly.
        last_gene_id = None
        n_features = 0
        with fout:
            for transcript_id, gene_id in c:
                transcript_attributes = {
                    self.transcript_key: [transcript_id],
                    self.gene_key: [gene_id]
                }
                fout.write(extent_line(
                    transcript_id, 'transcript', transcript_attributes))
                n_features += 1

                # Infer gene extent, but only if we haven't done so already.
                # Count it only when a line is actually written so that the
                # progress total matches the number of lines in the file.
                if gene_id != last_gene_id:
                    gene_attributes = {self.gene_key: [gene_id]}
                    fout.write(extent_line(gene_id, 'gene', gene_attributes))
                    n_features += 1
                    last_gene_id = gene_id

        def derived_feature_generator():
            """
            Generator of items from the file that was just created...
            """
            keys = ['parent', 'seqid', 'start', 'end', 'strand',
                    'featuretype', 'bin', 'attributes']
            with open(tmp) as handle:
                for line in handle:
                    d = dict(zip(keys, line.strip().split('\t')))
                    d.pop('parent')
                    d['score'] = '.'
                    d['source'] = 'gffutils_derived'
                    d['frame'] = '.'
                    d['extra'] = []
                    d['attributes'] = helpers._unjsonify(d['attributes'])
                    f = feature.Feature(**d)
                    f.id = self._id_handler(f)
                    yield f

        # Drop the indexes so the inserts are faster
        c.execute('DROP INDEX IF EXISTS relationsparent')
        c.execute('DROP INDEX IF EXISTS relationschild')

        # Insert the just-inferred transcripts and genes.  TODO: should we
        # *always* use "merge" here for the merge_strategy?
        logger.info("Importing inferred features into db")
        last_perc = None
        for i, f in enumerate(derived_feature_generator()):
            # Only rewrite the progress line when the percentage changes.
            perc = int(i / float(n_features) * 100)
            if perc != last_perc:
                sys.stderr.write('%s of %s (%s%%)\r' % (i, n_features, perc))
                sys.stderr.flush()
            last_perc = perc
            try:
                self._insert(f, c)
            except sqlite3.IntegrityError:
                fixed, final_strategy = self._do_merge(f, 'merge')
                c.execute(
                    '''
                    UPDATE features SET attributes = ?
                    WHERE id = ?
                    ''', (helpers._jsonify(fixed.attributes),
                          fixed.id))

        logger.info("Committing changes")
        self.conn.commit()
        os.unlink(tmp)