def main():
    """Parse the command line, read one alignment and hand it to prune_aln().

    Exactly one of the options any_gap, all_gap or identical must be set;
    its name is passed to prune_aln() as the operation to perform. The
    alignment is read from opts.aln_in ("-" means stdin) in the format
    given by opts.informat, or the format guessed from the file name when
    that option is empty. sys.stdout is passed to prune_aln() as the
    output handle.
    """
    parser = cmdline_parser()
    (opts, args) = parser.parse_args()

    if opts.verbose:
        LOG.setLevel(logging.INFO)
    if opts.debug:
        LOG.setLevel(logging.DEBUG)

    if not opts.aln_in:
        parser.error("Missing input alignment argument")
        sys.exit(1)

    # Validate the operation flags explicitly instead of with assert
    # statements, which disappear when Python runs with -O.
    requested = [name for name in ('any_gap', 'all_gap', 'identical')
                 if getattr(opts, name)]
    if len(requested) > 1:
        parser.error("Can only do one operation at a time")
        sys.exit(1)
    if not requested:
        parser.error("No operation selected")
        sys.exit(1)
    what = requested[0]

    if opts.aln_in == "-":
        fh_in = sys.stdin
    else:
        fh_in = open(opts.aln_in, "rU")

    # Make sure the input handle is released even if format guessing or
    # parsing fails; stdin is never closed by us.
    try:
        fmt = opts.informat
        if not fmt:
            fmt = bioutils.guess_seqformat(opts.aln_in)
        aln = AlignIO.read(fh_in, fmt)
    finally:
        if fh_in is not sys.stdin:
            fh_in.close()

    prune_aln(aln, what, sys.stdout)
def main():
    """Print per-column identity and entropy values for one alignment.

    The alignment is read from args.aln_in ("-" means stdin, which is
    parsed as fasta). For every alignment column one line is printed
    holding the 1-based column number, the value returned by seqid(), the
    value returned by shannon_entropy() and the per-character counts.

    Optional behaviour driven by the parsed arguments:

    - args.ign_seqs: ids of sequences whose residues are left out of every
      column before counting.
    - args.map_to: id of a sequence present in the alignment; columns in
      which that sequence has a gap are not printed and the reported
      column numbers are taken from unaln_pos_map() of that sequence.
    """
    parser = cmdline_parser()
    args = parser.parse_args()

    if args.verbose:
        LOG.setLevel(logging.INFO)
    if args.debug:
        LOG.setLevel(logging.DEBUG)

    if not args.aln_in:
        parser.error("Missing input alignment argument\n")
        sys.exit(1)

    # Alternative alphabets, currently disabled:
    #char_set = "ACGTU"
    #char_set = "ACDEFGHIKLMNPQRSTVWY"
    #x = any
    #z = Gln or Glu
    #b = Asp or Asn
    char_set = "ACGTN"
    char_set_ambig = "N"
    LOG.warn("using hardcoded charset %s" % char_set)
    # FIXME auto-detection of alphabet)

    if args.aln_in != "-" and not os.path.exists(args.aln_in):
        LOG.fatal("Input alignment %s does not exist.\n" % args.aln_in)
        sys.exit(1)

    if args.aln_in == "-":
        fh = sys.stdin
        fmt = 'fasta'
    else:
        fmt = guess_seqformat(args.aln_in)
        fh = open(args.aln_in, "rU")
    # only the name is needed once the alignment has been read
    fh_name = fh.name

    # note: had one case where this happily read an unaligned file!?
    try:
        aln = AlignIO.read(fh, fmt)
    finally:
        # The original compared against sys.stdout here, which closed
        # stdin and left real files open on every early exit below.
        if fh is not sys.stdin:
            fh.close()

    entropy_per_col = []
    seqid_per_col = []

    # if requested, get the sequence whose positions should be reported
    map_to_seq = None
    if args.map_to:
        candidates = [rec.seq for rec in aln if rec.id == args.map_to]
        if not candidates:
            LOG.fatal("Couldn't find a sequence called %s in %s" % (
                args.map_to, fh_name))
            sys.exit(1)
        elif len(candidates) > 1:
            LOG.fatal("Find more than one sequence with name %s in %s" % (
                args.map_to, fh_name))
            sys.exit(1)
        map_to_seq = candidates[0]
        map_to_seq_cols = unaln_pos_map(map_to_seq)

    # row index of the first record matching each id in args.ign_seqs
    ign_idxs = []
    if args.ign_seqs:
        aln_ids = [rec.id for rec in aln]
        for s in args.ign_seqs:
            try:
                ign_idxs.append(aln_ids.index(s))
            except ValueError:
                LOG.warn("No match for ignore sequence %s in alignment" % s)
    LOG.debug("ign_idxs = %s" % ign_idxs)
    ign_idx_set = set(ign_idxs)

    for cidx in range(aln.get_alignment_length()):
        # Drop ignored rows by filtering on the original row index.
        # Deleting by index one after the other shifts the remaining
        # elements and removes the wrong residues as soon as more than
        # one sequence is ignored.
        col = [c for (i, c) in enumerate(aln[:, cidx].upper())
               if i not in ign_idx_set]

        # normalise gaps to "-" and replace unknown characters with the
        # ambiguity symbol
        unknown_chars = set()
        for (i, c) in enumerate(col):
            if c in GAP_CHARS:
                col[i] = "-"
            elif c not in char_set:
                unknown_chars.add(c)
                col[i] = char_set_ambig
        if unknown_chars:
            LOG.warn("Found unknown characters in col %d (%s) and replaced them with %c" % (
                cidx+1, unknown_chars, char_set_ambig))

        counter = Counter(col)
        denom = sum(counter.values())
        if denom == 0:
            LOG.warn("No valid chars in col %d (col=%s)?" % (cidx+1, col))
            # placeholders keep both lists indexable by column
            entropy_per_col.append(-1)
            seqid_per_col.append(-1)
        else:
            # count gaps for entropy
            vec = [counter[res] / float(denom)
                   for res in list(char_set) + ["-"]]
            LOG.debug("vec=%s denom=%s counter=%s" % (vec, denom, counter))
            entropy_per_col.append(shannon_entropy(vec))
            seqid_per_col.append(seqid(counter))

        # Values are stored for every column (also the ones skipped below)
        # so that the lists stay aligned with the column index.
        if map_to_seq is not None and map_to_seq[cidx] in GAP_CHARS:
            LOG.debug("Skipping col %d because map_to_seq has gap there." % (cidx+1))
            continue

        counts_str = ' '.join(
            ["%s:%d" % (k, v) for (k, v) in sorted(counter.items())])

        if map_to_seq is None:
            rep_col = cidx
        else:
            rep_col = map_to_seq_cols[cidx]
        print("%d %.6f %.6f %s" % (
            rep_col+1, seqid_per_col[cidx], entropy_per_col[cidx], counts_str))
def main():
    """Print the sequence records that match a regular expression.

    The first positional argument is the pattern, all further ones are
    sequence files ("-" means stdin, names ending in ".gz" are opened with
    gzip). Depending on opts.search_in the pattern is searched in the
    sequence ('seq') or in the id ('id'; for fasta input the whole
    description is used). opts.revcomp reverse-complements the pattern
    first, opts.ignore_case makes the search case-insensitive and
    opts.invert_match prints the records that do NOT match.

    Matching records are printed as ">name" followed by the sequence; when
    more than one file is given both lines are prefixed with "file:".

    Raises:
        ValueError: if opts.search_in is neither 'seq' nor 'id'.
    """
    parser = cmdline_parser()
    (opts, args) = parser.parse_args()

    if opts.verbose:
        LOG.setLevel(logging.INFO)
    if opts.debug:
        LOG.setLevel(logging.DEBUG)

    if len(args) < 2:
        parser.error("Need pattern and at least one seqfile as argument")
        sys.exit(1)

    # first arg is pattern. rest are files
    pattern_arg = args[0]
    if opts.revcomp:
        pattern_arg = str(Seq(pattern_arg).reverse_complement())
        LOG.info("Pattern after reverse complement: %s" % pattern_arg)
    seqfiles_arg = args[1:]
    LOG.debug("args=%s" % (args))
    LOG.debug("pattern_arg=%s" % (pattern_arg))
    LOG.debug("seqfiles_arg=%s" % (seqfiles_arg))

    if opts.ignore_case:
        regexp = re.compile(pattern_arg, flags=re.IGNORECASE)
    else:
        regexp = re.compile(pattern_arg)

    # check all inputs up front so nothing is printed before failing
    for fseq in seqfiles_arg:
        if fseq != "-" and not os.path.exists(fseq):
            LOG.fatal("input file %s does not exist.\n" % fseq)
            sys.exit(1)

    print_file_prefix = len(seqfiles_arg) > 1

    for fseq in seqfiles_arg:
        if fseq == "-":
            fhandle = sys.stdin
        elif fseq[-3:] == ".gz":
            fhandle = gzip.open(fseq)
        else:
            fhandle = open(fseq, "rU")

        # Close every file we opened ourselves, also when parsing or
        # matching raises; stdin is left alone.
        try:
            fmt = bioutils.guess_seqformat(fseq)
            if not fmt:
                fmt = 'fasta'
            LOG.info("Checking file %s (format %s)" % (fseq, fmt))

            for record in SeqIO.parse(fhandle, fmt):
                if opts.search_in == 'seq':
                    target = record.seq
                elif opts.search_in == 'id':
                    # special case fasta: id is everything before the
                    # first whitespace. description contains this as well.
                    if fmt == 'fasta':
                        target = record.description
                    else:
                        target = record.id
                else:
                    raise ValueError(
                        "internal error...not sure where to search in")
                target = str(target)

                match = regexp.search(target)
                if match and not opts.invert_match:
                    LOG.debug("match.string=%s" % match.string)
                    print_match = True
                elif opts.invert_match and not match:
                    print_match = True
                else:
                    print_match = False

                if not print_match:
                    continue

                prefix = ""
                if print_file_prefix:
                    prefix = fseq + ":"
                if fmt == 'fasta':
                    name = record.description
                else:
                    name = record.id
                print("%s>%s\n%s%s" % (prefix, name, prefix, record.seq))
        finally:
            if fhandle is not sys.stdin:
                fhandle.close()
def main():
    """Print per-column identity and entropy values for one alignment.

    The alignment is read from opts.aln_in ("-" means stdin, which is
    parsed as fasta). For every alignment column one line is printed
    holding the 1-based column number, the value returned by seqid(), the
    value returned by shannon_entropy() and the per-character counts.
    Only characters of the hardcoded char_set enter the entropy vector.

    If opts.map_to names a sequence of the alignment, columns in which
    that sequence has a gap are not printed and the reported column
    numbers are taken from unaln_pos_map() of that sequence.

    Raises:
        ValueError: if a column contains no character of char_set.
    """
    parser = cmdline_parser()
    (opts, args) = parser.parse_args()

    if opts.verbose:
        LOG.setLevel(logging.INFO)
    if opts.debug:
        LOG.setLevel(logging.DEBUG)

    if not opts.aln_in:
        parser.error("Missing input alignment argument\n")
        sys.exit(1)
    if len(args):
        parser.error("Unrecognized arguments found: %s" % args)

    # Alternative alphabets; these used to be assigned and then
    # immediately overwritten, so they are kept as comments only:
    #char_set = "ACGTU"
    #char_set = "ACDEFGHIKLMNPQRSTVWY"
    #x = any
    #z = Gln or Glu
    #b = Asp or Asn
    char_set = "ACGTN"
    LOG.warn("using hardcoded charset %s" % char_set)
    # FIXME auto-detection of alphabet)

    if opts.aln_in != "-" and not os.path.exists(opts.aln_in):
        LOG.fatal("Input alignment %s does not exist.\n" % opts.aln_in)
        sys.exit(1)

    if opts.aln_in == "-":
        fh = sys.stdin
        fmt = 'fasta'
    else:
        fmt = bioutils.guess_seqformat(opts.aln_in)
        fh = open(opts.aln_in, "rU")
    # only the name is needed once the alignment has been read
    fh_name = fh.name

    # note: had one case where this happily read an unaligned file!?
    try:
        aln = AlignIO.read(fh, fmt)
    finally:
        # The original compared against sys.stdout here, which closed
        # stdin and left real files open on every early exit below.
        if fh is not sys.stdin:
            fh.close()

    entropy_per_col = []
    seqid_per_col = []

    # if requested, get the sequence whose positions should be reported
    map_to_seq = None
    if opts.map_to:
        candidates = [rec.seq for rec in aln if rec.id == opts.map_to]
        if not candidates:
            LOG.fatal("Couldn't find a sequence called %s in %s" % (
                opts.map_to, fh_name))
            sys.exit(1)
        elif len(candidates) > 1:
            LOG.fatal("Find more than one sequence with name %s in %s" % (
                opts.map_to, fh_name))
            sys.exit(1)
        map_to_seq = candidates[0]
        map_to_seq_cols = unaln_pos_map(map_to_seq)

    for i in range(aln.get_alignment_length()):
        #col = aln.get_column(i) # deprecated
        col = get_aln_column(aln, i).upper()

        not_in_char_set = [c for c in col
                           if c not in char_set
                           and c not in bioutils.GAP_CHARS]
        if not_in_char_set:
            LOG.warn("Found characters not in char_set (%s) in col %d (%s)" % (
                char_set, i+1, set(not_in_char_set)))

        counter = Counter(col)

        # this will ignore invalid chars incl. ambiguities
        denom = sum(counter[r] for r in char_set)
        if denom == 0:
            LOG.fatal("denom = 0, means no valid chars in col %d?" % (i+1))
            raise ValueError("no valid chars in col %d" % (i+1))

        vec = [counter[res] / float(denom) for res in char_set]
        LOG.debug("vec=%s denom=%s counter=%s" % (vec, denom, counter))

        entropy_per_col.append(shannon_entropy(vec))
        seqid_per_col.append(seqid(counter))

        # Values are stored for every column (also the ones skipped below)
        # so that the lists stay aligned with the column index.
        if map_to_seq is not None and map_to_seq[i] in bioutils.GAP_CHARS:
            LOG.debug("Skipping col %d because map_to_seq has gap there." % (i+1))
            continue

        counts_str = ' '.join(
            ["%s:%d" % (k, v) for (k, v) in sorted(counter.items())])

        if map_to_seq is None:
            rep_col = i
        else:
            rep_col = map_to_seq_cols[i]
        print("%d %.6f %.6f %s" % (
            rep_col+1, seqid_per_col[i], entropy_per_col[i], counts_str))
def main():
    """Print summary statistics for the sequences of one file.

    The single positional argument is the sequence file ("-" means stdin);
    its format is opts.informat or, if empty, guessed from the file name.
    All records are read into memory. Reported are the guessed sequence
    type of the first record, the number of sequences and the smallest,
    largest and average ungapped length.

    If there is more than one sequence and all have the same (gapped)
    length, the input is treated as aligned and pairwise identity
    statistics from comp_pairwise_ident_matrix() are added. With
    opts.info_for_all one line per sequence is printed as well. The
    number of unique names and unique sequences is printed last.
    """
    parser = cmdline_parser()
    (opts, args) = parser.parse_args()

    if opts.verbose:
        LOG.setLevel(logging.INFO)
    if opts.debug:
        LOG.setLevel(logging.DEBUG)

    if len(args) != 1:
        parser.error("Need sequence file as input argument")
        sys.exit(1)

    fseq = args[0]
    if fseq == "-":
        fhandle = sys.stdin
    else:
        fhandle = open(fseq, "rU")

    seqrecs = []
    seqlens = []
    seqlens_ungapped = []

    # read all into memory: makes id computation easier. the file
    # might come from stdin so we can't read twice
    try:
        fmt = opts.informat
        if not fmt:
            fmt = bioutils.guess_seqformat(fseq)
        for seqrec in SeqIO.parse(fhandle, fmt):
            seqrecs.append(seqrec)
            seqlens.append(len(seqrec.seq))
            seqlens_ungapped.append(len(bioutils.ungap(str(seqrec.seq))))
    finally:
        # release the file even if parsing fails; never close stdin
        if fhandle is not sys.stdin:
            fhandle.close()

    nseqs = len(seqlens)
    if nseqs == 0:
        LOG.warn('No sequences found. Try changing the format (just tried: %s)' % fmt)
        sys.exit(0)

    aligned = False
    if nseqs > 1 and len(set(seqlens)) == 1:
        # add and len(set(seqlens_ungapped)) != 1 to make sure
        # unaligned sequence lengths are identical
        aligned = True
        aln_len = seqlens[0]  # any will do as we know they're aligned
        pw_id_mx = comp_pairwise_ident_matrix(seqrecs)

    if not aligned and seqlens != seqlens_ungapped:
        LOG.warn("Found gaps, but sequences do not seem to be aligned."
                 " Stats will be for ungapped seqs.")

    # guess type from first entry.
    # NOTE(review): the two labels used to be swapped; this assumes that
    # guess_if_nucleic_acid() returns a true value for nucleic acids, as
    # its name says - confirm against the helper.
    if guess_if_nucleic_acid(seqrecs[0].seq):
        seqtype = 'nucleic'
    else:
        seqtype = 'protein'

    print("Type (of 1st seq):   %s" % (seqtype))
    print("Number of sequences: %d" % (nseqs))
    print("Smallest:            %d" % (
        min(seqlens_ungapped)))
    print("Largest:             %d" % (
        max(seqlens_ungapped)))
    print("Average length:      %.1f" % (
        sum(seqlens_ungapped)/float(len(seqlens_ungapped))))
    #print "Format: %s" % (fmt)

    print("Aligned:             %s" % ("yes" if aligned else "no"))
    if aligned:
        # Ignore only the self-comparison None's. A plain truth test
        # would also throw away pairs with an identity of exactly 0.0.
        flat_pw_id_mx = [x for x in chain.from_iterable(pw_id_mx)
                         if x is not None]
        print("Alignment length:    %d" % (aln_len))
        (mean, std) = meanstd(flat_pw_id_mx)
        print("Average identity:    %0.2f" % (
            mean))
        print("Standard deviation:  %0.2f" % (
            std))
        print("Most related pair:   %0.2f" % (
            max(flat_pw_id_mx)))
        print("Most unrelated pair: %0.2f" % (
            min(flat_pw_id_mx)))

    if opts.info_for_all:
        # spacer
        print("")
        header = "# Name\tLength"
        if aligned:
            header += "\thigh-id to\tlow-id to"
        print(header)

        for (i, seqrec) in enumerate(seqrecs):
            line = "%s\t%d" % (
                seqrec.id, seqlens_ungapped[i])

            if aligned:
                # Build the full list of pairwise ids for sequence i from
                # the triangular matrix: row i holds the values for
                # partners 0..i, the others sit in column i of later rows.
                # Work on a copy so the matrix itself is not modified.
                pw_ids = list(pw_id_mx[i])
                pw_ids.extend(pw_id_mx[j][i] for j in range(i+1, nseqs))
                assert len(pw_ids) == nseqs, (
                    "len(pw_ids)=%d, but expected %d" % (len(pw_ids), nseqs))

                # Find min and max and corresponding partner index, but
                # mask the self-comparison entry with a value that can
                # never win.
                pw_ids[i] = -1.0
                (pw_id_max_idx, pw_id_max_val) = argminmax(pw_ids, 'max')
                pw_ids[i] = 1.1
                (pw_id_min_idx, pw_id_min_val) = argminmax(pw_ids, 'min')

                line += "\t%.4f\t%s\t%.4f\t%s" % (
                    pw_id_max_val, seqrecs[pw_id_max_idx].id,
                    pw_id_min_val, seqrecs[pw_id_min_idx].id)
            print(line)

    print("%d names are unique and %d sequences are unique (including gaps)." % (
        len(set([s.id for s in seqrecs])),
        len(set([str(s.seq) for s in seqrecs]))))
def main():
    """Print summary statistics for the sequences of one file.

    The single positional argument is the sequence file ("-" means stdin);
    its format is opts.informat or, if empty, guessed from the file name.
    All records are read into memory. Reported are the guessed sequence
    type of the first record, the number of sequences and the smallest,
    largest and average ungapped length.

    If there is more than one sequence and all have the same (gapped)
    length, the input is treated as aligned and pairwise identity
    statistics from comp_pairwise_ident_matrix() are added. With
    opts.info_for_all one line per sequence is printed as well. The
    number of unique names and unique sequences is printed last.
    """
    parser = cmdline_parser()
    (opts, args) = parser.parse_args()

    if opts.verbose:
        LOG.setLevel(logging.INFO)
    if opts.debug:
        LOG.setLevel(logging.DEBUG)

    if len(args) != 1:
        parser.error("Need sequence file as input argument")
        sys.exit(1)

    fseq = args[0]
    if fseq == "-":
        fhandle = sys.stdin
    else:
        fhandle = open(fseq, "rU")

    seqrecs = []
    seqlens = []
    seqlens_ungapped = []

    # read all into memory: makes id computation easier. the file
    # might come from stdin so we can't read twice
    try:
        fmt = opts.informat
        if not fmt:
            fmt = bioutils.guess_seqformat(fseq)
        for seqrec in SeqIO.parse(fhandle, fmt):
            seqrecs.append(seqrec)
            seqlens.append(len(seqrec.seq))
            seqlens_ungapped.append(len(bioutils.ungap(str(seqrec.seq))))
    finally:
        # release the file even if parsing fails; never close stdin
        if fhandle is not sys.stdin:
            fhandle.close()

    nseqs = len(seqlens)
    if nseqs == 0:
        LOG.warn(
            'No sequences found. Try changing the format (just tried: %s)' %
            fmt)
        sys.exit(0)

    aligned = False
    if nseqs > 1 and len(set(seqlens)) == 1:
        # add and len(set(seqlens_ungapped)) != 1 to make sure
        # unaligned sequence lengths are identical
        aligned = True
        aln_len = seqlens[0]  # any will do as we know they're aligned
        pw_id_mx = comp_pairwise_ident_matrix(seqrecs)

    if not aligned and seqlens != seqlens_ungapped:
        LOG.warn("Found gaps, but sequences do not seem to be aligned."
                 " Stats will be for ungapped seqs.")

    # guess type from first entry.
    # NOTE(review): the two labels used to be swapped; this assumes that
    # guess_if_nucleic_acid() returns a true value for nucleic acids, as
    # its name says - confirm against the helper.
    if guess_if_nucleic_acid(seqrecs[0].seq):
        seqtype = 'nucleic'
    else:
        seqtype = 'protein'

    print("Type (of 1st seq):   %s" % (seqtype))
    print("Number of sequences: %d" % (nseqs))
    print("Smallest:            %d" % (min(seqlens_ungapped)))
    print("Largest:             %d" % (max(seqlens_ungapped)))
    print("Average length:      %.1f" %
          (sum(seqlens_ungapped) / float(len(seqlens_ungapped))))
    #print "Format: %s" % (fmt)

    print("Aligned:             %s" % ("yes" if aligned else "no"))
    if aligned:
        # Ignore only the self-comparison None's. A plain truth test
        # would also throw away pairs with an identity of exactly 0.0.
        flat_pw_id_mx = [
            x for x in chain.from_iterable(pw_id_mx) if x is not None
        ]
        print("Alignment length:    %d" % (aln_len))
        (mean, std) = meanstd(flat_pw_id_mx)
        print("Average identity:    %0.2f" % (mean))
        print("Standard deviation:  %0.2f" % (std))
        print("Most related pair:   %0.2f" % (max(flat_pw_id_mx)))
        print("Most unrelated pair: %0.2f" % (min(flat_pw_id_mx)))

    if opts.info_for_all:
        # spacer
        print("")
        header = "# Name\tLength"
        if aligned:
            header += "\thigh-id to\tlow-id to"
        print(header)

        for (i, seqrec) in enumerate(seqrecs):
            line = "%s\t%d" % (seqrec.id, seqlens_ungapped[i])

            if aligned:
                # Build the full list of pairwise ids for sequence i from
                # the triangular matrix: row i holds the values for
                # partners 0..i, the others sit in column i of later rows.
                # Work on a copy so the matrix itself is not modified.
                pw_ids = list(pw_id_mx[i])
                pw_ids.extend(pw_id_mx[j][i] for j in range(i + 1, nseqs))
                assert len(pw_ids) == nseqs, (
                    "len(pw_ids)=%d, but expected %d" % (len(pw_ids), nseqs))

                # Find min and max and corresponding partner index, but
                # mask the self-comparison entry with a value that can
                # never win.
                pw_ids[i] = -1.0
                (pw_id_max_idx, pw_id_max_val) = argminmax(pw_ids, 'max')
                pw_ids[i] = 1.1
                (pw_id_min_idx, pw_id_min_val) = argminmax(pw_ids, 'min')

                line += "\t%.4f\t%s\t%.4f\t%s" % (
                    pw_id_max_val, seqrecs[pw_id_max_idx].id,
                    pw_id_min_val, seqrecs[pw_id_min_idx].id)
            print(line)

    print("%d names are unique and %d sequences are unique (including gaps)." % (
        len(set([s.id for s in seqrecs])),
        len(set([str(s.seq) for s in seqrecs]))))
def main():
    """Map the features of a Genbank reference onto an aligned query.

    opts.ref_gb is a Genbank file that must hold exactly one record and
    opts.pw_aln an alignment file that must hold exactly two sequences.
    The alignment sequence whose id is the closest match (difflib) to the
    Genbank record id is taken as reference, the other one as query.

    After a header line, one tab-separated line is printed per Genbank
    feature: converted query positions, reference positions, strand, type
    and qualifiers (without 'translation'). The position conversion is
    done with PosMap.

    Invalid input is reported through LOG.fatal() followed by exit
    status 1.
    """
    parser = cmdline_parser()
    (opts, args) = parser.parse_args()

    if opts.verbose:
        LOG.setLevel(logging.INFO)
    if opts.debug:
        LOG.setLevel(logging.DEBUG)

    if len(args) != 0:
        parser.error("Unrecognized args found")
        sys.exit(1)

    for (f, d) in [(opts.pw_aln, "Pairwise alignment"),
                   (opts.ref_gb, "Reference Genbank")]:
        if not f:
            parser.error("Missing %s argument" % d)
        if not os.path.exists(f):
            LOG.fatal("%s file '%s' does not exist" % (d, f))
            sys.exit(1)

    # Input problems are reported explicitly rather than with assert
    # statements, which disappear when Python runs with -O.
    refseq = list(SeqIO.parse(opts.ref_gb, "genbank"))
    if len(refseq) != 1:
        LOG.fatal("Was expecting one sequence, but parsed %d from %s" % (
            len(refseq), opts.ref_gb))
        sys.exit(1)
    refseq = refseq[0]

    pw_aln = list(SeqIO.parse(opts.pw_aln,
                              bioutils.guess_seqformat(opts.pw_aln)))
    if len(pw_aln) != 2:
        LOG.fatal("Was expecting two sequences, but parsed %d from %s" % (
            len(pw_aln), opts.pw_aln))
        sys.exit(1)

    # determine ref id
    #
    # seqids in alignment should match genbank id but might not
    matches = difflib.get_close_matches(refseq.id, [s.id for s in pw_aln])
    if not matches:
        LOG.fatal(
            "Couldn't find a sensible match between sequence ids in alignment and genbank")
        sys.exit(1)
    aln_ref_id = matches[0]
    if aln_ref_id != refseq.id:
        LOG.warn("Assuming %s (from alignment) is the same as %s (from Genbank)" % (
            aln_ref_id, refseq.id))
    LOG.info("%s is the ref id" % (aln_ref_id))

    # determine query id: the one sequence that is not the reference.
    # If both sequences carry the reference id there is no query and the
    # original loop left query_id undefined.
    query_ids = [s.id for s in pw_aln if s.id != aln_ref_id]
    if not query_ids:
        LOG.fatal("Both sequences in %s are called %s" % (
            opts.pw_aln, aln_ref_id))
        sys.exit(1)
    query_id = query_ids[-1]
    LOG.info("%s is the query id" % (query_id))

    pos_map = PosMap(pw_aln)
    #pos_map.output()
    ref_to_query_map = pos_map.convert(aln_ref_id, query_id)

    print("#QUERY-POS (%s)\tREF-POS (%s)\tSTRAND\tTYPE\tQUALIFIERS" % (
        query_id, aln_ref_id))
    for feat in refseq.features:
        # start is shifted by one, end is used as is
        ref_start = feat.location.start.position+1
        ref_end = feat.location.end.position

        query_pos_str = "%d-%d" % (
            ref_to_query_map[ref_start], ref_to_query_map[ref_end])
        orig_pos_str = "%d-%d" % (ref_start, ref_end)
        strand_str = "%s" % feat.strand
        type_str = "%s" % feat.type
        qualifiers_str = '; '.join("%s %s" % (k, ', '.join(v))
                                   for (k, v) in feat.qualifiers.items()
                                   if k != 'translation')
        print('\t'.join([query_pos_str, orig_pos_str,
                         strand_str, type_str, qualifiers_str]))

    LOG.info("Note: feature positions might overlap")
def main():
    """Map the features of a Genbank reference onto an aligned query.

    opts.ref_gb is a Genbank file that must hold exactly one record and
    opts.pw_aln an alignment file that must hold exactly two sequences.
    The alignment sequence whose id is the closest match (difflib) to the
    Genbank record id is taken as reference, the other one as query.

    After a header line, one tab-separated line is printed per Genbank
    feature: converted query positions, reference positions, strand, type
    and qualifiers (without 'translation'). The position conversion is
    done with PosMap.

    Invalid input is reported through LOG.fatal() followed by exit
    status 1.
    """
    parser = cmdline_parser()
    (opts, args) = parser.parse_args()

    if opts.verbose:
        LOG.setLevel(logging.INFO)
    if opts.debug:
        LOG.setLevel(logging.DEBUG)

    if len(args) != 0:
        parser.error("Unrecognized args found")
        sys.exit(1)

    for (f, d) in [(opts.pw_aln, "Pairwise alignment"),
                   (opts.ref_gb, "Reference Genbank")]:
        if not f:
            parser.error("Missing %s argument" % d)
        if not os.path.exists(f):
            LOG.fatal("%s file '%s' does not exist" % (d, f))
            sys.exit(1)

    # Input problems are reported explicitly rather than with assert
    # statements, which disappear when Python runs with -O.
    refseq = list(SeqIO.parse(opts.ref_gb, "genbank"))
    if len(refseq) != 1:
        LOG.fatal("Was expecting one sequence, but parsed %d from %s" %
                  (len(refseq), opts.ref_gb))
        sys.exit(1)
    refseq = refseq[0]

    pw_aln = list(
        SeqIO.parse(opts.pw_aln, bioutils.guess_seqformat(opts.pw_aln)))
    if len(pw_aln) != 2:
        LOG.fatal("Was expecting two sequences, but parsed %d from %s" %
                  (len(pw_aln), opts.pw_aln))
        sys.exit(1)

    # determine ref id
    #
    # seqids in alignment should match genbank id but might not
    matches = difflib.get_close_matches(refseq.id, [s.id for s in pw_aln])
    if not matches:
        LOG.fatal(
            "Couldn't find a sensible match between sequence ids in alignment and genbank"
        )
        sys.exit(1)
    aln_ref_id = matches[0]
    if aln_ref_id != refseq.id:
        LOG.warn(
            "Assuming %s (from alignment) is the same as %s (from Genbank)" %
            (aln_ref_id, refseq.id))
    LOG.info("%s is the ref id" % (aln_ref_id))

    # determine query id: the one sequence that is not the reference.
    # If both sequences carry the reference id there is no query and the
    # original loop left query_id undefined.
    query_ids = [s.id for s in pw_aln if s.id != aln_ref_id]
    if not query_ids:
        LOG.fatal("Both sequences in %s are called %s" %
                  (opts.pw_aln, aln_ref_id))
        sys.exit(1)
    query_id = query_ids[-1]
    LOG.info("%s is the query id" % (query_id))

    pos_map = PosMap(pw_aln)
    #pos_map.output()
    ref_to_query_map = pos_map.convert(aln_ref_id, query_id)

    print("#QUERY-POS (%s)\tREF-POS (%s)\tSTRAND\tTYPE\tQUALIFIERS" % (
        query_id, aln_ref_id))
    for feat in refseq.features:
        # start is shifted by one, end is used as is
        ref_start = feat.location.start.position + 1
        ref_end = feat.location.end.position

        query_pos_str = "%d-%d" % (ref_to_query_map[ref_start],
                                   ref_to_query_map[ref_end])
        orig_pos_str = "%d-%d" % (ref_start, ref_end)
        strand_str = "%s" % feat.strand
        type_str = "%s" % feat.type
        qualifiers_str = '; '.join("%s %s" % (k, ', '.join(v))
                                   for (k, v) in feat.qualifiers.items()
                                   if k != 'translation')
        print('\t'.join([
            query_pos_str, orig_pos_str, strand_str, type_str, qualifiers_str
        ]))

    LOG.info("Note: feature positions might overlap")