def main():
    """Command-line entry point: print shuffled copies of FASTA sequences.

    Options are read from ``sys.argv`` (see the usage text below).  Every
    sequence read from the ``-f`` file is passed to ``dinuclShuffle``
    ``copies`` times and each result is written to stdout as a FASTA record
    whose name is the original name plus the ``-t`` tag (and ``_<k>`` when
    more than one copy is requested).

    Exits with status 1 after printing the usage text to stderr when no
    arguments are given, when ``-h`` is given, when an option is missing its
    value, when ``-s``/``-c`` are not integers, or when ``-f`` is absent.
    An unrecognised option prints a short error and exits with status 1.
    """
    # defaults
    file_name = None
    alphabet_file_name = None
    seed = 1
    copies = 1
    tag = ""

    usage = """USAGE:
    %s [options]

    -f <filename>   file name (required)
    -t <tag>        added to shuffled sequence names
    -s <seed>       random seed; default: %d
    -c <n>          make <n> shuffled copies of each sequence; default: %d
    -a <filename>   alphabet file to use non-DNA alphabets
    -h              print this usage message

    Note that fasta-shuffle-letters also supports dinucleotide shuffling and is faster.
    """ % (sys.argv[0], seed, copies)

    def usage_exit():
        # Same output as the print statement: the text plus one newline.
        sys.stderr.write(usage + "\n")
        sys.exit(1)

    # no arguments: print usage
    if len(sys.argv) == 1:
        usage_exit()

    # parse command line
    i = 1
    while i < len(sys.argv):
        arg = sys.argv[i]
        if arg == "-f":
            i += 1
            try:
                file_name = sys.argv[i]
            except IndexError:
                usage_exit()
        elif arg == "-t":
            i += 1
            try:
                tag = sys.argv[i]
            except IndexError:
                usage_exit()
        elif arg == "-s":
            i += 1
            try:
                # IndexError: value missing; ValueError: not an integer.
                seed = int(sys.argv[i])
            except (IndexError, ValueError):
                usage_exit()
        elif arg == "-c":
            i += 1
            try:
                copies = int(sys.argv[i])
            except (IndexError, ValueError):
                usage_exit()
        elif arg == "-a":
            i += 1
            try:
                alphabet_file_name = sys.argv[i]
            except IndexError:
                usage_exit()
        elif arg == "-h":
            usage_exit()
        else:
            sys.stderr.write("Unknown command line argument: " + arg + "\n")
            sys.exit(1)
        i += 1

    # check that required arguments given
    if file_name is None:
        usage_exit()

    # get the alphabet, defaulting to DNA if it is not provided
    if alphabet_file_name is not None:
        alph = alphabet.loadFromFile(alphabet_file_name)
    else:
        alph = alphabet.dna()

    # Seed before any shuffling so runs with the same seed are repeatable.
    random.seed(seed)

    # read sequences and emit the shuffled copies
    seqs = sequence.readFASTA(file_name, alph)
    for s in seqs:
        seq = s.getString()
        name = s.getName()
        # A separate index name keeps the argv cursor ``i`` untouched.
        for copy_index in range(copies):
            shuffledSeq = dinuclShuffle(seq, alph)
            if copies == 1:
                sys.stdout.write(">%s\n%s\n" % (name + tag, shuffledSeq))
            else:
                sys.stdout.write(
                    ">%s_%d\n%s\n" % (name + tag, copy_index, shuffledSeq))
def main():
    """Command-line entry point: Hamming-distance enrichment of a word.

    Options are read from ``sys.argv`` (see the usage text below).  The
    positive and negative FASTA files are read with the chosen alphabet
    (DNA unless ``-a`` is given), the word is optionally refined with
    ``refine_consensus`` (``-r``), and ``get_best_hamming_alignment`` is run
    on the resulting word.  A summary goes to stderr, the motif is printed
    via ``print_meme_header`` / ``print_meme_motif``, and the elapsed time
    is written to both stderr and stdout.

    Exits with status 1 after printing the usage text to stderr when no
    arguments are given, on ``-h``, on an unknown option, when an option is
    missing its value, or when any of ``-w``, ``-p``, ``-n`` is absent.
    """
    word = None                 # no word specified
    pos_seq_file_name = None    # no positive sequence file specified
    neg_seq_file_name = None    # no negative sequence file specified
    alphabet_file_name = None
    refine = False
    given_only = False

    usage = """USAGE:
    %s [options]

    -w <word>               word (required)
    -p <file_name>          positive sequences FASTA file name (required)
    -n <file_name>          negative sequences FASTA file name (required)
    -a <file_name>          alphabet definition file
    -r                      refine consensus by branching search
                            (distance 1 steps; beam size = 1).
    -h                      print this usage message

    Compute the Hamming distance from <word> to each FASTA sequence
    in the positive and negative files. Apply Fisher's Exact test to
    each distance.

    <word> may contain ambiguous characters.

    """ % (sys.argv[0])

    def usage_exit():
        # Same output as the print statement: the text plus one newline.
        sys.stderr.write(usage + "\n")
        sys.exit(1)

    # no arguments: print usage
    if len(sys.argv) == 1:
        usage_exit()

    # parse command line
    i = 1
    while i < len(sys.argv):
        arg = sys.argv[i]
        if arg == "-w":
            i += 1
            try:
                word = sys.argv[i]
            except IndexError:
                usage_exit()
        elif arg == "-p":
            i += 1
            try:
                pos_seq_file_name = sys.argv[i]
            except IndexError:
                usage_exit()
        elif arg == "-n":
            i += 1
            try:
                neg_seq_file_name = sys.argv[i]
            except IndexError:
                usage_exit()
        elif arg == "-a":
            i += 1
            try:
                alphabet_file_name = sys.argv[i]
            except IndexError:
                usage_exit()
        elif arg == "-r":
            refine = True
        else:
            # "-h" and any unrecognised option both print the usage text.
            usage_exit()
        i += 1

    # check that required arguments given; the word is required as well,
    # otherwise it would only fail later as an unbound name.
    if word is None or pos_seq_file_name is None or neg_seq_file_name is None:
        usage_exit()

    # keep track of time
    start_time = time.time()

    # read alphabet
    if alphabet_file_name is not None:
        alph = alphabet.loadFromFile(alphabet_file_name)
    else:
        alph = alphabet.dna()
    if not alph.isComplementable():
        given_only = True
        sys.stderr.write("Alphabet is not complementable...\n")

    # read sequences
    sys.stderr.write("Reading sequences...\n")
    pos_seqs = get_strings_from_seqs(
        sequence.readFASTA(pos_seq_file_name, alph))
    neg_seqs = get_strings_from_seqs(
        sequence.readFASTA(neg_seq_file_name, alph))

    if refine:
        # Only the refined word is used; the second element is discarded.
        (best_word, _) = refine_consensus(
            word, pos_seqs, neg_seqs, alph, given_only)
    else:
        best_word = word

    sys.stderr.write("Computing Hamming alignment...\n")
    (dist, log_pvalue, p, P, n, N, aln) = get_best_hamming_alignment(
        best_word, pos_seqs, neg_seqs, alph, given_only)
    pv_string = sprint_logx(log_pvalue, 1, _pv_format)
    nsites = len(aln)
    sys.stderr.write("[ %s %s %s %s %s ]\n" % (p, P, n, N, dist))
    sys.stderr.write(
        "Best ZOOPs alignment has %d sites / %d at distance %d with p-value %s\n"
        % (nsites, P, dist, pv_string))
    print_meme_header(alph)
    print_meme_motif(best_word, nsites, pv_string, aln, alph)

    # print elapsed time
    end_time = time.time()
    elapsed = end_time - start_time
    sys.stderr.write("elapsed time: %.2f seconds\n" % elapsed)
    sys.stdout.write("#elapsed time: %.2f seconds\n" % elapsed)
def main():
    """Command-line entry point: Hamming-distance enrichment of a word.

    Options are read from ``sys.argv`` (see the usage text below).  The
    positive and negative FASTA files are read with the chosen alphabet
    (DNA unless ``-a`` is given), the word is optionally refined with
    ``refine_consensus`` (``-r``), and ``get_best_hamming_alignment`` is run
    on the resulting word.  A summary goes to stderr, the motif is printed
    via ``print_meme_header`` / ``print_meme_motif``, and the elapsed time
    is written to both stderr and stdout.

    Exits with status 1 after printing the usage text to stderr when no
    arguments are given, on ``-h``, on an unknown option, when an option is
    missing its value, or when any of ``-w``, ``-p``, ``-n`` is absent.
    """
    word = None                 # no word specified
    pos_seq_file_name = None    # no positive sequence file specified
    neg_seq_file_name = None    # no negative sequence file specified
    alphabet_file_name = None
    refine = False
    given_only = False

    usage = """USAGE:
    %s [options]

    -w <word>               word (required)
    -p <file_name>          positive sequences FASTA file name (required)
    -n <file_name>          negative sequences FASTA file name (required)
    -a <file_name>          alphabet definition file
    -r                      refine consensus by branching search
                            (distance 1 steps; beam size = 1).
    -h                      print this usage message

    Compute the Hamming distance from <word> to each FASTA sequence
    in the positive and negative files. Apply Fisher's Exact test to
    each distance.

    <word> may contain ambiguous characters.

    """ % (sys.argv[0])

    def usage_exit():
        # Same output as the print statement: the text plus one newline.
        sys.stderr.write(usage + "\n")
        sys.exit(1)

    # no arguments: print usage
    if len(sys.argv) == 1:
        usage_exit()

    # parse command line
    i = 1
    while i < len(sys.argv):
        arg = sys.argv[i]
        if arg == "-w":
            i += 1
            try:
                word = sys.argv[i]
            except IndexError:
                usage_exit()
        elif arg == "-p":
            i += 1
            try:
                pos_seq_file_name = sys.argv[i]
            except IndexError:
                usage_exit()
        elif arg == "-n":
            i += 1
            try:
                neg_seq_file_name = sys.argv[i]
            except IndexError:
                usage_exit()
        elif arg == "-a":
            i += 1
            try:
                alphabet_file_name = sys.argv[i]
            except IndexError:
                usage_exit()
        elif arg == "-r":
            refine = True
        else:
            # "-h" and any unrecognised option both print the usage text.
            usage_exit()
        i += 1

    # check that required arguments given; the word is required as well,
    # otherwise it would only fail later as an unbound name.
    if word is None or pos_seq_file_name is None or neg_seq_file_name is None:
        usage_exit()

    # keep track of time
    start_time = time.time()

    # read alphabet
    if alphabet_file_name is not None:
        alph = alphabet.loadFromFile(alphabet_file_name)
    else:
        alph = alphabet.dna()
    if not alph.isComplementable():
        given_only = True
        sys.stderr.write("Alphabet is not complementable...\n")

    # read sequences
    sys.stderr.write("Reading sequences...\n")
    pos_seqs = get_strings_from_seqs(
        sequence.readFASTA(pos_seq_file_name, alph))
    neg_seqs = get_strings_from_seqs(
        sequence.readFASTA(neg_seq_file_name, alph))

    if refine:
        # Only the refined word is used; the second element is discarded.
        (best_word, _) = refine_consensus(
            word, pos_seqs, neg_seqs, alph, given_only)
    else:
        best_word = word

    sys.stderr.write("Computing Hamming alignment...\n")
    (dist, log_pvalue, p, P, n, N, aln) = get_best_hamming_alignment(
        best_word, pos_seqs, neg_seqs, alph, given_only)
    pv_string = sprint_logx(log_pvalue, 1, _pv_format)
    nsites = len(aln)
    sys.stderr.write("[ %s %s %s %s %s ]\n" % (p, P, n, N, dist))
    sys.stderr.write(
        "Best ZOOPs alignment has %d sites / %d at distance %d with p-value %s\n"
        % (nsites, P, dist, pv_string))
    print_meme_header(alph)
    print_meme_motif(best_word, nsites, pv_string, aln, alph)

    # print elapsed time
    end_time = time.time()
    elapsed = end_time - start_time
    sys.stderr.write("elapsed time: %.2f seconds\n" % elapsed)
    sys.stdout.write("#elapsed time: %.2f seconds\n" % elapsed)