def guess_if_nucleic_acid(seq, thresh = 0.90, nucleic_acid_letters=NUCLEIC_ACID_LETTERS): """Guess if the given sequence is a nucleic acid. It's considered nucleic acid if more than 90% of the sequence is nucleic_acid_letters (uppercase). The threshold is configurable via the thresh parameter. Based on http://www.mailinglistarchive.com/html/[email protected]/2009-01/msg00009.html (Author: Brad Chapman) """ assert isinstance(seq, Seq.Seq) or \ isinstance(seq, type("")) or \ isinstance(seq, type(u"")) # could use Seq.ungap if Seq.Seq seq = bioutils.ungap(str(seq).upper()) nuc_alpha_count = 0 # don't use collections Counter (Python 2.7 only) for letter in nucleic_acid_letters: nuc_alpha_count += seq.count(letter) #print "DEBUG", len(seq), letter, seq.count(letter) if len(seq) == 0: return False elif float(nuc_alpha_count) / float(len(seq)) >= thresh: return False else: return True
def guess_if_nucleic_acid(seq, thresh=0.90, nucleic_acid_letters=NUCLEIC_ACID_LETTERS): """Guess if the given sequence is a nucleic acid. It's considered nucleic acid if more than 90% of the sequence is nucleic_acid_letters (uppercase). The threshold is configurable via the thresh parameter. Based on http://www.mailinglistarchive.com/html/[email protected]/2009-01/msg00009.html (Author: Brad Chapman) """ assert isinstance(seq, Seq.Seq) or \ isinstance(seq, type("")) or \ isinstance(seq, type(u"")) # could use Seq.ungap if Seq.Seq seq = bioutils.ungap(str(seq).upper()) nuc_alpha_count = 0 # don't use collections Counter (Python 2.7 only) for letter in nucleic_acid_letters: nuc_alpha_count += seq.count(letter) #print "DEBUG", len(seq), letter, seq.count(letter) if len(seq) == 0: return False elif float(nuc_alpha_count) / float(len(seq)) >= thresh: return False else: return True
def pairwise_identity(s1, s2): """Return fractional pairwise identity between two aligned strings, which is defined here as the number of identical residues (case sensitive), divived by the smaller of the two ungapped sequences. Uppercase your sequence for case insensitivity. For mixed RNA/DNA you might want to replace T's with U's vice versa. Based on ideas from http://code.activestate.com/recipes/499304-hamming-distance/ """ assert len(s1) == len(s2) idents = sum(c1 == c2 for c1, c2 in izip(s1, s2) if not bioutils.isgap(c1) and not bioutils.isgap(c2)) min_ungapped_len = min(len(bioutils.ungap(s1)), len(bioutils.ungap(s2))) return idents / float(min_ungapped_len)
def main(): """ The main function """ parser = cmdline_parser() (opts, args) = parser.parse_args() if opts.verbose: LOG.setLevel(logging.INFO) if opts.debug: LOG.setLevel(logging.DEBUG) if len(args) != 1: parser.error("Need sequence file as input argument") sys.exit(1) fseq = args[0] if fseq == "-": fhandle = sys.stdin else: fhandle = open(fseq, "rU") fmt = opts.informat if not fmt: fmt = bioutils.guess_seqformat(fseq) seqrecs = [] seqlens = [] seqlens_ungapped = [] # read all into memory: makes id computation easier. the file # might come from stdin so we can't read twice for seqrec in SeqIO.parse(fhandle, fmt): seqrecs.append(seqrec) seqlens.append(len(seqrec.seq)) seqlens_ungapped.append(len(bioutils.ungap(str(seqrec.seq)))) if fhandle != sys.stdin: fhandle.close() nseqs = len(seqlens) if nseqs == 0: LOG.warn('No sequences found. Try changing the format (just tried: %s)' % fmt) sys.exit(0) aligned = False if nseqs > 1 and len(set(seqlens)) == 1: # add and len(set(seqlens_ungapped)) != 1 to make sure # unaligend sequence length are identical aligned = True aln_len = seqlens[0] # any will do as we know they're aligned pw_id_mx = comp_pairwise_ident_matrix(seqrecs) if not aligned and seqlens != seqlens_ungapped: LOG.warn("Found gaps, but sequences do not seem to be aligned." " Stats will be for ungapped seqs.") # guess type from first entry if guess_if_nucleic_acid(seqrecs[0].seq): seqtype = 'protein' else: seqtype = 'nucleic' print "Type (of 1st seq): %s" % (seqtype) print "Number of sequences: %d" % (nseqs) print "Smallest: %d" % ( min(seqlens_ungapped)) print "Largest: %d" % ( max(seqlens_ungapped)) print "Average length: %.1f" % ( sum(seqlens_ungapped)/float(len(seqlens_ungapped))) #print "Format: %s" % (fmt) print "Aligned: %s" % ("yes" if aligned else "no") if aligned: # make sure to ignore self-comparison None's flat_pw_id_mx = [x for x in chain.from_iterable(pw_id_mx) if x] print "Alignment length: %d" % (aln_len) (mean, std) = meanstd(flat_pw_id_mx) print "Average identity: %0.2f" % ( mean) print "Standard deviation: %0.2f" % ( std) print "Most related pair: %0.2f" % ( max(flat_pw_id_mx)) print "Most unrelated pair: %0.2f" % ( min(flat_pw_id_mx)) if opts.info_for_all: # spacer print "" header = "# Name\tLength" if aligned: header += "\thigh-id to\tlow-id to" print header for (i, seqrec) in enumerate(seqrecs): line = "%s\t%d" % ( seqrec.id, seqlens_ungapped[i]) if aligned: # construct list of pairwise ids from fake matrix. pw_ids = pw_id_mx[i] pw_ids.extend([pw_id_mx[j][i] for j in xrange(i+1, nseqs)]) assert len(pw_ids) == nseqs, ( "len(pw_ids)=%d, but expected %d" % (len(pw_ids), nseqs)) # Find min and max and corresponding partner index, # but take care to ignore self-comparison value 'None' pw_ids[i] = -1.0 (pw_id_max_idx, pw_id_max_val) = argminmax(pw_ids, 'max') pw_ids[i] = 1.1 (pw_id_min_idx, pw_id_min_val) = argminmax(pw_ids, 'min') pw_ids[i] = None # reset even though not strictly necessary line += "\t%.4f\t%s\t%.4f\t%s" % ( pw_id_max_val, seqrecs[pw_id_max_idx].id, pw_id_min_val, seqrecs[pw_id_min_idx].id) print line print "%d names are unique and %d sequences are unique (including gaps)." % ( len(set([s.id for s in seqrecs])), len(set([str(s.seq) for s in seqrecs])))
def main(): """ The main function """ parser = cmdline_parser() (opts, args) = parser.parse_args() if opts.verbose: LOG.setLevel(logging.INFO) if opts.debug: LOG.setLevel(logging.DEBUG) if len(args) != 1: parser.error("Need sequence file as input argument") sys.exit(1) fseq = args[0] if fseq == "-": fhandle = sys.stdin else: fhandle = open(fseq, "rU") fmt = opts.informat if not fmt: fmt = bioutils.guess_seqformat(fseq) seqrecs = [] seqlens = [] seqlens_ungapped = [] # read all into memory: makes id computation easier. the file # might come from stdin so we can't read twice for seqrec in SeqIO.parse(fhandle, fmt): seqrecs.append(seqrec) seqlens.append(len(seqrec.seq)) seqlens_ungapped.append(len(bioutils.ungap(str(seqrec.seq)))) if fhandle != sys.stdin: fhandle.close() nseqs = len(seqlens) if nseqs == 0: LOG.warn( 'No sequences found. Try changing the format (just tried: %s)' % fmt) sys.exit(0) aligned = False if nseqs > 1 and len(set(seqlens)) == 1: # add and len(set(seqlens_ungapped)) != 1 to make sure # unaligend sequence length are identical aligned = True aln_len = seqlens[0] # any will do as we know they're aligned pw_id_mx = comp_pairwise_ident_matrix(seqrecs) if not aligned and seqlens != seqlens_ungapped: LOG.warn("Found gaps, but sequences do not seem to be aligned." " Stats will be for ungapped seqs.") # guess type from first entry if guess_if_nucleic_acid(seqrecs[0].seq): seqtype = 'protein' else: seqtype = 'nucleic' print "Type (of 1st seq): %s" % (seqtype) print "Number of sequences: %d" % (nseqs) print "Smallest: %d" % (min(seqlens_ungapped)) print "Largest: %d" % (max(seqlens_ungapped)) print "Average length: %.1f" % (sum(seqlens_ungapped) / float(len(seqlens_ungapped))) #print "Format: %s" % (fmt) print "Aligned: %s" % ("yes" if aligned else "no") if aligned: # make sure to ignore self-comparison None's flat_pw_id_mx = [x for x in chain.from_iterable(pw_id_mx) if x] print "Alignment length: %d" % (aln_len) (mean, std) = meanstd(flat_pw_id_mx) print "Average identity: %0.2f" % (mean) print "Standard deviation: %0.2f" % (std) print "Most related pair: %0.2f" % (max(flat_pw_id_mx)) print "Most unrelated pair: %0.2f" % (min(flat_pw_id_mx)) if opts.info_for_all: # spacer print "" header = "# Name\tLength" if aligned: header += "\thigh-id to\tlow-id to" print header for (i, seqrec) in enumerate(seqrecs): line = "%s\t%d" % (seqrec.id, seqlens_ungapped[i]) if aligned: # construct list of pairwise ids from fake matrix. pw_ids = pw_id_mx[i] pw_ids.extend([pw_id_mx[j][i] for j in xrange(i + 1, nseqs)]) assert len(pw_ids) == nseqs, ( "len(pw_ids)=%d, but expected %d" % (len(pw_ids), nseqs)) # Find min and max and corresponding partner index, # but take care to ignore self-comparison value 'None' pw_ids[i] = -1.0 (pw_id_max_idx, pw_id_max_val) = argminmax(pw_ids, 'max') pw_ids[i] = 1.1 (pw_id_min_idx, pw_id_min_val) = argminmax(pw_ids, 'min') pw_ids[i] = None # reset even though not strictly necessary line += "\t%.4f\t%s\t%.4f\t%s" % ( pw_id_max_val, seqrecs[pw_id_max_idx].id, pw_id_min_val, seqrecs[pw_id_min_idx].id) print line print "%d names are unique and %d sequences are unique (including gaps)." % ( len(set([s.id for s in seqrecs])), len(set([str(s.seq) for s in seqrecs])))