Ejemplo n.º 1
0
def full_transposon_treatment(seq,overlap,gap,minlength,fastaout,evalue=None,
                              fname=None):
    '''This is where it all comes together. This takes a sequence of
    hits, assumed to constitute an entire blast search between one
    transposon and one fly genome. (See note below.)  It performs the
    main process of this module -- i.e., creating the input for a
    multiple-alignment -- and dumps that information in FASTA format
    to *fastaout*, which must be a writeable fasta object (see module
    *fasta*). The user is naturally responsible for closing it, if
    appropriate (as it is in almost all cases).

    The hits are grouped by their SSEQID attribute; each group is split
    into islands with makeislands(hits, gap); each island is classified
    into standalone hits and nests with classifyrecords(island, overlap);
    every nest is passed through stratify(nest, minlength); and the
    entries produced by resolve_query_overlap() are handed to
    fastaout.writeentries().

    Arguments:
      seq       -- sequence of hits, or None to load them from *fname*.
      overlap   -- overlap threshold passed to classifyrecords() and
                   resolve_query_overlap().
      gap       -- gap threshold passed to makeislands().
      minlength -- length threshold passed to stratify().
      fastaout  -- object providing writeentries().
      evalue    -- accepted for interface compatibility; not read here.
      fname     -- file object or filename given to hitsfromcsv() when
                   *seq* is None.

    Raises Error if both *seq* and *fname* are given, or if an island
    produces no records at all.

    NOTE: Generally it is best to have *seq* come from the function
    hitsfromcsv(). This can be done implicitly by giving None as the
    first argument, in which case *fname* is expected to be a file object
    or filename to be given to hitsfromcsv().
    '''
    # Identity checks: membership tests on a tuple compare with ==, which
    # is not what "was this argument supplied?" means.
    if seq is not None and fname is not None:
        raise Error("Cannot give both seq and fname arguments")
    if seq is None:
        seq = hitsfromcsv(fname)
    # Only each group's hits are needed; the SSEQID key itself is unused.
    for _sseqid,hits in utils.groupby(seq,key=_attrget('SSEQID')).iteritems():
        for island in makeislands(hits,gap):
            singles,nests = classifyrecords(island,overlap)
            # stratify() is documented as a generator function, and a
            # generator object is always truthy. Materialize every nest so
            # that any() below really tests whether a nest has hits left.
            nests = [list(stratify(nest,minlength)) for nest in nests]
            if not (singles or any(nests)):
                raise Error('No records result from file {!r}'.format(fname))
            fastaout.writeentries(
                resolve_query_overlap(singles,nests,overlap))
Ejemplo n.º 2
0
def stratify(nest,minlength):
    '''Takes a nest, i.e. a list of hits for which each adjacent pair has
    nontrivial overlap in the subject ordinates, and yields according to
    the following process:
    - Yield x, the hit with lowest EVALUE (largest length is tiebreaker)
    - For each other hit y, truncate (or if necessary, split) y so as to
      remove any overlap with x. If this makes y a trivial hit, i.e. one
      whose length is less than minlength, remove it from the nest entirely.
    - Repeat until the nest has been exhausted.

    The process is actually implemented in an abstract fashion using a
    helper function - see _stratify(). This wrapper only binds the
    subject-ordinate accessors (_SSTART/_SEND), the ranking function
    hit_rank and the length filter, and returns whatever _stratify()
    returns.

    Arguments:
      nest      -- list of hits forming one nest.
      minlength -- minimum LENGTH a hit must have to be kept; a hit of
                   exactly *minlength* is kept.
    '''
    return _stratify(nest,
        rank=hit_rank,
        # ">=" rather than ">": only hits strictly shorter than minlength
        # are trivial, so a hit of exactly minlength must survive.
        filterfunc=lambda x: x.LENGTH >= minlength,
        sget=_attrget('_SSTART'),
        eget=_attrget('_SEND'),
        sset=set__SSTART,
        eset=set__SEND)
Ejemplo n.º 3
0
def resolve_query_overlap(standalones,nests,overlap):
    '''Expects a list of standalone fragments and a list of nests. Nests are
    expected to have undergone the subject overlap truncation scheme (see
    *stratify*). The return value is always a list of fasta entries.

    Every fragment is first given a name ('standalone[j]' or 'nestI[j]',
    nests being numbered from 1) and all fragments are then ordered by
    QSTART. After that:
    - If there is only one fragment, or if any two fragments adjacent in
      that order have a query overlap of at least *overlap*, each fragment
      becomes its own entry (via make_entry).
    - Otherwise the fragments are "assembled" (non-technical term) in order
      of query-ordinates into a single fasta entry, which is the only
      element of the returned list. Stretches of the query not covered by
      any fragment are filled with '-'.

    Raises Error if there are no fragments at all.
    '''
    standalones = list(standalones)
    # Real lists rather than a lazy map object, so the emptiness check
    # below cannot consume anything that is needed later.
    nests = [list(nest) for nest in nests]
    if not (standalones or any(nests)):
        raise Error('Tried to resolve query overlap on an empty set of records!')

    # assign names before reordering
    for j,hit in enumerate(standalones):
        setname(hit,'standalone[{}]'.format(j))
    for i,nest in enumerate(nests,1):
        for j,hit in enumerate(nest):
            setname(hit,'nest{}[{}]'.format(i,j))

    # and then reorder by query ordinate
    recs = sorted(_it.chain(standalones,*nests),key=_attrget('QSTART'))

    conflict = any(q_overlap(x,y)>=overlap for x,y in zip(recs,recs[1:]))
    if conflict or len(recs)==1:
        # A list (not a lazy iterator) so both branches return the same type.
        return [make_entry(hit) for hit in recs]

    prev_qend = 0   # query position covered so far; 0 before the first hit
    with _cont.closing(_sIO()) as seq:
        for hit in recs:
            # Pad the uncovered stretch of the query. A non-positive count
            # (fragments touching or overlapping by less than *overlap*)
            # simply yields an empty string.
            seq.write('-'*(hit.QSTART-1-prev_qend))
            seq.write(hit.SSEQ)
            prev_qend = hit.QEND
        result = fasta.seq_entry({'SEQ': seq.getvalue(),
         'NAME': _name_fmt.format(_GRP='all',SSEQID=recs[0].SSEQID,
         SSTART=min(h._SSTART for h in recs),SEND=max(h._SEND for h in recs))})
    return [result]
Ejemplo n.º 4
0
def s_overlap(x,y):
    '''Subject-ordinate overlap between hits x and y, in base pairs.

    Delegates to _overlap() with accessors for the _SSTART and _SEND
    attributes. The result is intended to be zero if and only if the two
    subject ranges are disjoint.'''
    start_of = _attrget('_SSTART')
    end_of = _attrget('_SEND')
    return _overlap(x,y,start_of,end_of)
Ejemplo n.º 5
0
def q_overlap(x,y):
    '''Query-ordinate overlap between hits x and y, in base pairs.

    Delegates to _overlap() with accessors for the QSTART and QEND
    attributes. The result is intended to be zero if and only if the two
    query ranges are disjoint.'''
    start_of = _attrget('QSTART')
    end_of = _attrget('QEND')
    return _overlap(x,y,start_of,end_of)
Ejemplo n.º 6
0
def s_distance(x,y):
    '''Measures distance between the subject ordinates of x and y. Returns 1
    if they are adjacent, 0 if they overlap.

    Delegates to _dist() with accessors for the _SSTART and _SEND
    attributes, mirroring s_overlap().'''
    # The s_ prefix means subject ordinates (compare s_overlap), so the
    # accessors must read _SSTART/_SEND rather than the query QSTART/QEND.
    return _dist(x,y,_attrget('_SSTART'),_attrget('_SEND'))