Esempio n. 1
0
def guess_if_nucleic_acid(seq, thresh = 0.90,
                          nucleic_acid_letters=NUCLEIC_ACID_LETTERS):
    """Classify a sequence by its fraction of nucleic acid letters.

    The sequence is converted to an uppercase string and passed through
    bioutils.ungap(). The occurrences of every letter in
    nucleic_acid_letters are then summed and divided by the length of
    the ungapped sequence.

    NOTE(review): despite the function name, the result is False when
    that fraction is greater than or equal to thresh (i.e. when the
    sequence looks like a nucleic acid) and True otherwise. A sequence
    that is empty after ungapping also yields False. The caller in
    main() maps a true result to 'protein', so the two currently agree
    with each other; changing one requires changing the other.

    Based on http://www.mailinglistarchive.com/html/[email protected]/2009-01/msg00009.html
    (Author: Brad Chapman)
    """

    accepted_types = (Seq.Seq, type(""), type(u""))
    assert isinstance(seq, accepted_types)

    # Seq.ungap would also work for Seq.Seq input
    ungapped = bioutils.ungap(str(seq).upper())

    # collections.Counter is avoided on purpose (Python 2.7 only)
    hits = sum(ungapped.count(letter) for letter in nucleic_acid_letters)

    total = len(ungapped)
    if total == 0:
        return False
    return not (float(hits) / float(total) >= thresh)
Esempio n. 2
0
def guess_if_nucleic_acid(seq,
                          thresh=0.90,
                          nucleic_acid_letters=NUCLEIC_ACID_LETTERS):
    """Classify a sequence by its fraction of nucleic acid letters.

    The sequence is converted to an uppercase string and passed through
    bioutils.ungap(). The occurrences of every letter in
    nucleic_acid_letters are then summed and divided by the length of
    the ungapped sequence.

    NOTE(review): despite the function name, the result is False when
    that fraction is greater than or equal to thresh (i.e. when the
    sequence looks like a nucleic acid) and True otherwise. A sequence
    that is empty after ungapping also yields False. The caller in
    main() maps a true result to 'protein', so the two currently agree
    with each other; changing one requires changing the other.

    Based on http://www.mailinglistarchive.com/html/[email protected]/2009-01/msg00009.html
    (Author: Brad Chapman)
    """

    accepted_types = (Seq.Seq, type(""), type(u""))
    assert isinstance(seq, accepted_types)

    # Seq.ungap would also work for Seq.Seq input
    ungapped = bioutils.ungap(str(seq).upper())

    # collections.Counter is avoided on purpose (Python 2.7 only)
    hits = sum(ungapped.count(letter) for letter in nucleic_acid_letters)

    total = len(ungapped)
    if total == 0:
        return False
    return not (float(hits) / float(total) >= thresh)
Esempio n. 3
0
def pairwise_identity(s1, s2):
    """Return the fractional pairwise identity of two aligned strings.

    Identity is defined here as the number of alignment columns in
    which both strings carry the same residue (case sensitive; columns
    where either string has a gap are skipped), divided by the length
    of the shorter of the two ungapped sequences.

    Uppercase your sequences for case insensitivity. For mixed RNA/DNA
    you might want to replace T's with U's or vice versa.

    Based on ideas from
    http://code.activestate.com/recipes/499304-hamming-distance/

    Args:
        s1: first aligned sequence.
        s2: second aligned sequence; must have the same length as s1.

    Returns:
        The identity as a float. 0.0 is returned if either sequence has
        no residues left after ungapping, since there is nothing to
        compare and the ratio would otherwise be 0/0.
    """

    assert len(s1) == len(s2), (
        "aligned sequences differ in length: %d vs %d" % (len(s1), len(s2)))

    # booleans sum as 0/1, so this counts the identical, gap-free columns
    idents = sum(c1 == c2 for c1, c2 in izip(s1, s2)
                 if not bioutils.isgap(c1) and not bioutils.isgap(c2))
    min_ungapped_len = min(len(bioutils.ungap(s1)), len(bioutils.ungap(s2)))
    if min_ungapped_len == 0:
        # empty or all-gap sequence: avoid ZeroDivisionError
        return 0.0
    return idents / float(min_ungapped_len)
Esempio n. 4
0
def pairwise_identity(s1, s2):
    """Return the fractional pairwise identity of two aligned strings.

    Identity is defined here as the number of alignment columns in
    which both strings carry the same residue (case sensitive; columns
    where either string has a gap are skipped), divided by the length
    of the shorter of the two ungapped sequences.

    Uppercase your sequences for case insensitivity. For mixed RNA/DNA
    you might want to replace T's with U's or vice versa.

    Based on ideas from
    http://code.activestate.com/recipes/499304-hamming-distance/

    Args:
        s1: first aligned sequence.
        s2: second aligned sequence; must have the same length as s1.

    Returns:
        The identity as a float. 0.0 is returned if either sequence has
        no residues left after ungapping, since there is nothing to
        compare and the ratio would otherwise be 0/0.
    """

    assert len(s1) == len(s2), (
        "aligned sequences differ in length: %d vs %d" % (len(s1), len(s2)))

    # booleans sum as 0/1, so this counts the identical, gap-free columns
    idents = sum(c1 == c2 for c1, c2 in izip(s1, s2)
                 if not bioutils.isgap(c1) and not bioutils.isgap(c2))
    min_ungapped_len = min(len(bioutils.ungap(s1)), len(bioutils.ungap(s2)))
    if min_ungapped_len == 0:
        # empty or all-gap sequence: avoid ZeroDivisionError
        return 0.0
    return idents / float(min_ungapped_len)
Esempio n. 5
0
def main():
    """
    The main function
    """

    parser = cmdline_parser()
    (opts, args) = parser.parse_args()

    if opts.verbose:
        LOG.setLevel(logging.INFO)
    if opts.debug:
        LOG.setLevel(logging.DEBUG)
        
    if len(args) != 1:
        parser.error("Need sequence file as input argument")
        sys.exit(1)

        
    fseq = args[0]
    if fseq == "-":
        fhandle = sys.stdin
    else:
        fhandle = open(fseq, "rU")


    fmt = opts.informat
    if not fmt:
        fmt = bioutils.guess_seqformat(fseq)


    seqrecs = []
    seqlens = []
    seqlens_ungapped = []
    
    # read all into memory: makes id computation easier. the file
    # might come from stdin so we can't read twice
    for seqrec in SeqIO.parse(fhandle, fmt):
        seqrecs.append(seqrec)
        seqlens.append(len(seqrec.seq))
        seqlens_ungapped.append(len(bioutils.ungap(str(seqrec.seq))))
    if fhandle != sys.stdin:
        fhandle.close()
            
    nseqs = len(seqlens)        
    if nseqs == 0:
        LOG.warn('No sequences found. Try changing the format (just tried: %s)' % fmt)
        sys.exit(0)

    
    aligned = False
    if nseqs > 1 and len(set(seqlens)) == 1:
        # add and len(set(seqlens_ungapped)) != 1 to make sure
        # unaligend sequence length are identical
        aligned = True
        aln_len = seqlens[0] # any will do as we know they're aligned
        pw_id_mx = comp_pairwise_ident_matrix(seqrecs)

    if not aligned and seqlens != seqlens_ungapped:
        LOG.warn("Found gaps, but sequences do not seem to be aligned."
                 " Stats will be for ungapped seqs.")
         
    # guess type from first entry
    if guess_if_nucleic_acid(seqrecs[0].seq):
        seqtype = 'protein' 
    else:
        seqtype = 'nucleic'
    print "Type (of 1st seq):   %s" % (seqtype)

    print "Number of sequences: %d" % (nseqs)        
    print "Smallest:            %d" % (
        min(seqlens_ungapped))
    print "Largest:             %d" % (
        max(seqlens_ungapped))
    print "Average length:      %.1f" % (
        sum(seqlens_ungapped)/float(len(seqlens_ungapped)))
    #print "Format:              %s" % (fmt)
    
    print "Aligned:             %s" % ("yes" if aligned else "no")
    if aligned:
        # make sure to ignore self-comparison None's
        flat_pw_id_mx = [x for x in chain.from_iterable(pw_id_mx) if x]
        print "Alignment length:    %d" % (aln_len)        
        (mean, std) = meanstd(flat_pw_id_mx)
        print "Average identity:    %0.2f" % (
            mean)
        print "Standard deviation:  %0.2f" % (
            std)
        print "Most related pair:   %0.2f" % (
            max(flat_pw_id_mx))
        print "Most unrelated pair: %0.2f" % (
            min(flat_pw_id_mx))
    
    if opts.info_for_all:
        # spacer
        print ""
        
        header = "# Name\tLength"
        if aligned:
            header += "\thigh-id to\tlow-id to"
        print header
        
        for (i, seqrec) in enumerate(seqrecs):
            line = "%s\t%d" % (
                seqrec.id, seqlens_ungapped[i])
            
            if aligned:
                # construct list of pairwise ids from fake matrix. 
                pw_ids = pw_id_mx[i]
                pw_ids.extend([pw_id_mx[j][i] 
                               for j in xrange(i+1, nseqs)])
                assert len(pw_ids) == nseqs, (
                    "len(pw_ids)=%d, but expected %d" % (len(pw_ids), nseqs))

                # Find min and max and corresponding partner index,
                # but take care to ignore self-comparison value 'None'
                pw_ids[i] = -1.0
                (pw_id_max_idx, pw_id_max_val) = argminmax(pw_ids, 'max')
                pw_ids[i] = 1.1
                (pw_id_min_idx, pw_id_min_val) = argminmax(pw_ids, 'min')
                pw_ids[i] = None # reset even though not strictly necessary

                line += "\t%.4f\t%s\t%.4f\t%s" % (
                    pw_id_max_val, seqrecs[pw_id_max_idx].id,
                    pw_id_min_val, seqrecs[pw_id_min_idx].id)
            print line

    print "%d names are unique and %d sequences are unique (including gaps)." % (
        len(set([s.id for s in seqrecs])),
        len(set([str(s.seq) for s in seqrecs])))
Esempio n. 6
0
def main():
    """
    The main function
    """

    parser = cmdline_parser()
    (opts, args) = parser.parse_args()

    if opts.verbose:
        LOG.setLevel(logging.INFO)
    if opts.debug:
        LOG.setLevel(logging.DEBUG)

    if len(args) != 1:
        parser.error("Need sequence file as input argument")
        sys.exit(1)

    fseq = args[0]
    if fseq == "-":
        fhandle = sys.stdin
    else:
        fhandle = open(fseq, "rU")

    fmt = opts.informat
    if not fmt:
        fmt = bioutils.guess_seqformat(fseq)

    seqrecs = []
    seqlens = []
    seqlens_ungapped = []

    # read all into memory: makes id computation easier. the file
    # might come from stdin so we can't read twice
    for seqrec in SeqIO.parse(fhandle, fmt):
        seqrecs.append(seqrec)
        seqlens.append(len(seqrec.seq))
        seqlens_ungapped.append(len(bioutils.ungap(str(seqrec.seq))))
    if fhandle != sys.stdin:
        fhandle.close()

    nseqs = len(seqlens)
    if nseqs == 0:
        LOG.warn(
            'No sequences found. Try changing the format (just tried: %s)' %
            fmt)
        sys.exit(0)

    aligned = False
    if nseqs > 1 and len(set(seqlens)) == 1:
        # add and len(set(seqlens_ungapped)) != 1 to make sure
        # unaligend sequence length are identical
        aligned = True
        aln_len = seqlens[0]  # any will do as we know they're aligned
        pw_id_mx = comp_pairwise_ident_matrix(seqrecs)

    if not aligned and seqlens != seqlens_ungapped:
        LOG.warn("Found gaps, but sequences do not seem to be aligned."
                 " Stats will be for ungapped seqs.")

    # guess type from first entry
    if guess_if_nucleic_acid(seqrecs[0].seq):
        seqtype = 'protein'
    else:
        seqtype = 'nucleic'
    print "Type (of 1st seq):   %s" % (seqtype)

    print "Number of sequences: %d" % (nseqs)
    print "Smallest:            %d" % (min(seqlens_ungapped))
    print "Largest:             %d" % (max(seqlens_ungapped))
    print "Average length:      %.1f" % (sum(seqlens_ungapped) /
                                         float(len(seqlens_ungapped)))
    #print "Format:              %s" % (fmt)

    print "Aligned:             %s" % ("yes" if aligned else "no")
    if aligned:
        # make sure to ignore self-comparison None's
        flat_pw_id_mx = [x for x in chain.from_iterable(pw_id_mx) if x]
        print "Alignment length:    %d" % (aln_len)
        (mean, std) = meanstd(flat_pw_id_mx)
        print "Average identity:    %0.2f" % (mean)
        print "Standard deviation:  %0.2f" % (std)
        print "Most related pair:   %0.2f" % (max(flat_pw_id_mx))
        print "Most unrelated pair: %0.2f" % (min(flat_pw_id_mx))

    if opts.info_for_all:
        # spacer
        print ""

        header = "# Name\tLength"
        if aligned:
            header += "\thigh-id to\tlow-id to"
        print header

        for (i, seqrec) in enumerate(seqrecs):
            line = "%s\t%d" % (seqrec.id, seqlens_ungapped[i])

            if aligned:
                # construct list of pairwise ids from fake matrix.
                pw_ids = pw_id_mx[i]
                pw_ids.extend([pw_id_mx[j][i] for j in xrange(i + 1, nseqs)])
                assert len(pw_ids) == nseqs, (
                    "len(pw_ids)=%d, but expected %d" % (len(pw_ids), nseqs))

                # Find min and max and corresponding partner index,
                # but take care to ignore self-comparison value 'None'
                pw_ids[i] = -1.0
                (pw_id_max_idx, pw_id_max_val) = argminmax(pw_ids, 'max')
                pw_ids[i] = 1.1
                (pw_id_min_idx, pw_id_min_val) = argminmax(pw_ids, 'min')
                pw_ids[i] = None  # reset even though not strictly necessary

                line += "\t%.4f\t%s\t%.4f\t%s" % (
                    pw_id_max_val, seqrecs[pw_id_max_idx].id, pw_id_min_val,
                    seqrecs[pw_id_min_idx].id)
            print line

    print "%d names are unique and %d sequences are unique (including gaps)." % (
        len(set([s.id
                 for s in seqrecs])), len(set([str(s.seq) for s in seqrecs])))