def dinuclShuffle(s, alph=alphabet.dna()):
    """Return a shuffled rearrangement of ``s`` that keeps its first and last symbol.

    The sequence is turned into per-symbol lists of "following" symbols
    (``computeList``).  The edges returned by ``eulerian`` are pinned to the
    end of their lists, every list is shuffled, and the result is read back
    by walking from the first symbol and always taking the next unused
    successor.  Judging by the name, the intent is a dinucleotide-preserving
    shuffle -- NOTE(review): that property depends on ``eulerian`` and
    ``computeList``, which are defined elsewhere.

    Parameters:
        s: the sequence string to shuffle.
        alph: alphabet object providing ``getIndex(symbol)`` and
            ``getSymbol(index)``; defaults to ``alphabet.dna()``.

    Returns:
        ``s`` itself when ``len(s) <= 2``; otherwise a new string of the
        same length produced by the walk described above.
    """
    # check we can actually shuffle it
    if len(s) <= 2:
        return s

    # determine how to end the sequence
    edgeList = eulerian(s, alph)

    # turn the sequence into lists of following symbols
    symIList = computeList(s, alph)

    # remove last edges from each vertex list, shuffle, then add back
    # the removed edges at end of vertex lists.
    for [x, y] in edgeList:
        symIList[x].remove(y)
    for x in range(len(symIList)):
        random.shuffle(symIList[x])
    for [x, y] in edgeList:
        symIList[x].append(y)

    # Reverse every list once so the walk below can take the next successor
    # with an O(1) pop() from the end rather than an O(n) pop(0) from the
    # front; the successors are still consumed in exactly the same order.
    for x in range(len(symIList)):
        symIList[x].reverse()

    # construct the eulerian path
    prevSymI = alph.getIndex(s[0])
    L = [alph.getSymbol(prevSymI)]
    for _ in range(len(s) - 2):
        symI = symIList[prevSymI].pop()
        L.append(alph.getSymbol(symI))
        prevSymI = symI

    # the final symbol is always the original final symbol
    L.append(alph.getSymbol(alph.getIndex(s[-1])))
    return "".join(L)
def dinuclShuffle(s, alph=alphabet.dna()):
    """Return a shuffled rearrangement of ``s`` that keeps its first and last symbol.

    The sequence is turned into per-symbol lists of "following" symbols
    (``computeList``).  The edges returned by ``eulerian`` are pinned to the
    end of their lists, every list is shuffled, and the result is read back
    by walking from the first symbol and always taking the next unused
    successor.  Judging by the name, the intent is a dinucleotide-preserving
    shuffle -- NOTE(review): that property depends on ``eulerian`` and
    ``computeList``, which are defined elsewhere.

    Parameters:
        s: the sequence string to shuffle.
        alph: alphabet object providing ``getIndex(symbol)`` and
            ``getSymbol(index)``; defaults to ``alphabet.dna()``.

    Returns:
        ``s`` itself when ``len(s) <= 2``; otherwise a new string of the
        same length produced by the walk described above.
    """
    # check we can actually shuffle it
    if len(s) <= 2:
        return s

    # determine how to end the sequence
    edgeList = eulerian(s, alph)

    # turn the sequence into lists of following symbols
    symIList = computeList(s, alph)

    # remove last edges from each vertex list, shuffle, then add back
    # the removed edges at end of vertex lists.
    for [x, y] in edgeList:
        symIList[x].remove(y)
    for x in range(len(symIList)):
        random.shuffle(symIList[x])
    for [x, y] in edgeList:
        symIList[x].append(y)

    # Reverse every list once so the walk below can take the next successor
    # with an O(1) pop() from the end rather than an O(n) pop(0) from the
    # front; the successors are still consumed in exactly the same order.
    for x in range(len(symIList)):
        symIList[x].reverse()

    # construct the eulerian path
    prevSymI = alph.getIndex(s[0])
    L = [alph.getSymbol(prevSymI)]
    for _ in range(len(s) - 2):
        symI = symIList[prevSymI].pop()
        L.append(alph.getSymbol(symI))
        prevSymI = symI

    # the final symbol is always the original final symbol
    L.append(alph.getSymbol(alph.getIndex(s[-1])))
    return "".join(L)
def main():
    """Command-line entry point: Hamming-distance enrichment of a word.

    Parses ``sys.argv``, reads the positive and negative FASTA files,
    optionally refines the word with ``refine_consensus`` (``-r``), computes
    the best Hamming alignment with ``get_best_hamming_alignment`` and emits
    the result through ``print_meme_header`` / ``print_meme_motif``.
    Progress messages and the summary go to stderr; the elapsed time is
    written to both stderr and stdout.

    Prints the usage message to stderr and exits with status 1 when no
    arguments are given, an option is unknown, an option value is missing,
    ``-h`` is given, or any of the required ``-w``, ``-p`` and ``-n``
    options is absent.
    """
    word = None                 # no word specified
    pos_seq_file_name = None    # no positive sequence file specified
    neg_seq_file_name = None    # no negative sequence file specified
    alphabet_file_name = None
    refine = False
    given_only = False

    #
    # get command line arguments
    #
    usage = """USAGE:
    %s [options]

    -w <word>               word (required)
    -p <file_name>          positive sequences FASTA file name (required)
    -n <file_name>          negative sequences FASTA file name (required)
    -a <file_name>          alphabet definition file
    -r                      refine consensus by branching search
                            (distance 1 steps; beam size = 1).
    -h                      print this usage message

    Compute the Hamming distance from <word> to each FASTA sequence
    in the positive and negative files.  Apply Fisher's Exact test to
    each distance.

    <word> may contain ambiguous characters.
    """ % (sys.argv[0])

    def usage_exit():
        # Every argument error is reported the same way: usage + status 1.
        print(usage, file=sys.stderr)
        sys.exit(1)

    # no arguments: print usage
    if len(sys.argv) == 1:
        usage_exit()

    # parse command line
    i = 1
    while i < len(sys.argv):
        arg = sys.argv[i]
        if arg in ("-w", "-p", "-n", "-a"):
            # these options consume the following argument as their value
            i += 1
            if i >= len(sys.argv):
                usage_exit()
            value = sys.argv[i]
            if arg == "-w":
                word = value
            elif arg == "-p":
                pos_seq_file_name = value
            elif arg == "-n":
                neg_seq_file_name = value
            else:
                alphabet_file_name = value
        elif arg == "-r":
            refine = True
        else:
            # "-h" and any unknown option both print the usage and exit
            usage_exit()
        i += 1

    # check that required arguments given; the word is required too, and
    # without this check a missing -w would surface later as a NameError
    if word is None or pos_seq_file_name is None or neg_seq_file_name is None:
        usage_exit()

    # keep track of time
    start_time = time.time()

    # read alphabet, defaulting to DNA when no definition file is given
    if alphabet_file_name is not None:
        alph = alphabet.loadFromFile(alphabet_file_name)
    else:
        alph = alphabet.dna()

    # read sequences
    print("Reading sequences...", file=sys.stderr)
    pos_seqs = get_strings_from_seqs(
        sequence.readFASTA(pos_seq_file_name, alph))
    neg_seqs = get_strings_from_seqs(
        sequence.readFASTA(neg_seq_file_name, alph))

    if refine:
        # only the refined word is used; its log p-value is recomputed below
        best_word, _ = refine_consensus(
            word, pos_seqs, neg_seqs, alph, given_only)
    else:
        best_word = word

    print("Computing Hamming alignment...", file=sys.stderr)
    (dist, log_pvalue, p, P, n, N, aln) = get_best_hamming_alignment(
        best_word, pos_seqs, neg_seqs, alph, given_only)
    pv_string = sprint_logx(log_pvalue, 1, _pv_format)
    nsites = len(aln)
    print("[", p, P, n, N, dist, "]", file=sys.stderr)
    print(
        "Best ZOOPs alignment has %d sites / %d at distance %d with p-value %s"
        % (nsites, P, dist, pv_string),
        file=sys.stderr)
    print_meme_header(alph)
    print_meme_motif(best_word, nsites, pv_string, aln, alph)

    # print elapsed time
    end_time = time.time()
    elapsed = end_time - start_time
    print("elapsed time: %.2f seconds" % elapsed, file=sys.stderr)
    print("#elapsed time: %.2f seconds" % elapsed, file=sys.stdout)
def main():
    """Command-line entry point: write shuffled copies of FASTA sequences.

    Parses ``sys.argv``, seeds ``random``, reads the sequences from the file
    given with ``-f`` and prints, for each sequence, ``copies`` results of
    ``dinuclShuffle`` to stdout in FASTA form.  The sequence name has the
    ``-t`` tag appended, plus ``_<copy index>`` when more than one copy is
    requested.

    Exits with status 1 after printing the usage message when no arguments
    are given, an option value is missing, ``-s``/``-c`` is not an integer,
    ``-h`` is given, or ``-f`` is absent.  An unknown option prints an
    "Unknown command line argument" message instead and also exits with
    status 1.
    """
    #
    # defaults
    #
    file_name = None
    alphabet_file_name = None
    seed = 1
    copies = 1
    tag = ""

    #
    # get command line arguments
    #
    usage = """USAGE:
    %s [options]

    -f <filename>   file name (required)
    -t <tag>        added to shuffled sequence names
    -s <seed>       random seed; default: %d
    -c <n>          make <n> shuffled copies of each sequence; default: %d
    -a <filename>   alphabet file to use non-DNA alphabets
    -h              print this usage message

    Note that fasta-shuffle-letters also supports dinucleotide shuffling and is faster.
    """ % (sys.argv[0], seed, copies)

    def usage_exit():
        # Shared exit path for every usage error.
        print(usage, file=sys.stderr)
        sys.exit(1)

    # no arguments: print usage
    if len(sys.argv) == 1:
        usage_exit()

    # parse command line
    i = 1
    while i < len(sys.argv):
        arg = sys.argv[i]
        if arg in ("-f", "-t", "-s", "-c", "-a"):
            # these options consume the following argument as their value
            i += 1
            try:
                value = sys.argv[i]
                if arg == "-f":
                    file_name = value
                elif arg == "-t":
                    tag = value
                elif arg == "-s":
                    seed = int(value)
                elif arg == "-c":
                    copies = int(value)
                else:
                    alphabet_file_name = value
            except (IndexError, ValueError):
                # IndexError: value missing; ValueError: not an integer
                usage_exit()
        elif arg == "-h":
            usage_exit()
        else:
            print("Unknown command line argument: " + arg, file=sys.stderr)
            sys.exit(1)
        i += 1

    # check that required arguments given
    if file_name is None:
        usage_exit()

    # get the alphabet, defaulting to DNA if it is not provided
    if alphabet_file_name is not None:
        alph = alphabet.loadFromFile(alphabet_file_name)
    else:
        alph = alphabet.dna()

    random.seed(seed)

    # read sequences
    seqs = sequence.readFASTA(file_name, alph)

    for s in seqs:
        seq = s.getString()
        name = s.getName()
        for copy_index in range(copies):
            shuffledSeq = dinuclShuffle(seq, alph)
            if copies == 1:
                print(">%s\n%s" % (name + tag, shuffledSeq), file=sys.stdout)
            else:
                print(">%s_%d\n%s" % (name + tag, copy_index, shuffledSeq),
                      file=sys.stdout)
def main():
    """Command-line entry point: write shuffled copies of FASTA sequences.

    Parses ``sys.argv``, seeds ``random``, reads the sequences from the file
    given with ``-f`` and prints, for each sequence, ``copies`` results of
    ``dinuclShuffle`` to stdout in FASTA form.  The sequence name has the
    ``-t`` tag appended, plus ``_<copy index>`` when more than one copy is
    requested.

    Exits with status 1 after printing the usage message when no arguments
    are given, an option value is missing, ``-s``/``-c`` is not an integer,
    ``-h`` is given, or ``-f`` is absent.  An unknown option prints an
    "Unknown command line argument" message instead and also exits with
    status 1.
    """
    #
    # defaults
    #
    file_name = None
    alphabet_file_name = None
    seed = 1
    copies = 1
    tag = ""

    #
    # get command line arguments
    #
    usage = """USAGE:
    %s [options]

    -f <filename>   file name (required)
    -t <tag>        added to shuffled sequence names
    -s <seed>       random seed; default: %d
    -c <n>          make <n> shuffled copies of each sequence; default: %d
    -a <filename>   alphabet file to use non-DNA alphabets
    -h              print this usage message

    Note that fasta-shuffle-letters also supports dinucleotide shuffling and is faster.
    """ % (sys.argv[0], seed, copies)

    def usage_exit():
        # Shared exit path for every usage error.
        print(usage, file=sys.stderr)
        sys.exit(1)

    # no arguments: print usage
    if len(sys.argv) == 1:
        usage_exit()

    # parse command line
    i = 1
    while i < len(sys.argv):
        arg = sys.argv[i]
        if arg in ("-f", "-t", "-s", "-c", "-a"):
            # these options consume the following argument as their value
            i += 1
            try:
                value = sys.argv[i]
                if arg == "-f":
                    file_name = value
                elif arg == "-t":
                    tag = value
                elif arg == "-s":
                    seed = int(value)
                elif arg == "-c":
                    copies = int(value)
                else:
                    alphabet_file_name = value
            except (IndexError, ValueError):
                # IndexError: value missing; ValueError: not an integer
                usage_exit()
        elif arg == "-h":
            usage_exit()
        else:
            print("Unknown command line argument: " + arg, file=sys.stderr)
            sys.exit(1)
        i += 1

    # check that required arguments given
    if file_name is None:
        usage_exit()

    # get the alphabet, defaulting to DNA if it is not provided
    if alphabet_file_name is not None:
        alph = alphabet.loadFromFile(alphabet_file_name)
    else:
        alph = alphabet.dna()

    random.seed(seed)

    # read sequences
    seqs = sequence.readFASTA(file_name, alph)

    for s in seqs:
        seq = s.getString()
        name = s.getName()
        for copy_index in range(copies):
            shuffledSeq = dinuclShuffle(seq, alph)
            if copies == 1:
                print(">%s\n%s" % (name + tag, shuffledSeq), file=sys.stdout)
            else:
                print(">%s_%d\n%s" % (name + tag, copy_index, shuffledSeq),
                      file=sys.stdout)
def main():
    """Command-line entry point: Hamming-distance enrichment of a word.

    Parses ``sys.argv``, reads the positive and negative FASTA files,
    optionally refines the word with ``refine_consensus`` (``-r``), computes
    the best Hamming alignment with ``get_best_hamming_alignment`` and emits
    the result through ``print_meme_header`` / ``print_meme_motif``.
    Progress messages and the summary go to stderr; the elapsed time is
    written to both stderr and stdout.

    Prints the usage message to stderr and exits with status 1 when no
    arguments are given, an option is unknown, an option value is missing,
    ``-h`` is given, or any of the required ``-w``, ``-p`` and ``-n``
    options is absent.
    """
    word = None                 # no word specified
    pos_seq_file_name = None    # no positive sequence file specified
    neg_seq_file_name = None    # no negative sequence file specified
    alphabet_file_name = None
    refine = False
    given_only = False

    #
    # get command line arguments
    #
    usage = """USAGE:
    %s [options]

    -w <word>               word (required)
    -p <file_name>          positive sequences FASTA file name (required)
    -n <file_name>          negative sequences FASTA file name (required)
    -a <file_name>          alphabet definition file
    -r                      refine consensus by branching search
                            (distance 1 steps; beam size = 1).
    -h                      print this usage message

    Compute the Hamming distance from <word> to each FASTA sequence
    in the positive and negative files.  Apply Fisher's Exact test to
    each distance.

    <word> may contain ambiguous characters.
    """ % (sys.argv[0])

    def usage_exit():
        # Every argument error is reported the same way: usage + status 1.
        print(usage, file=sys.stderr)
        sys.exit(1)

    # no arguments: print usage
    if len(sys.argv) == 1:
        usage_exit()

    # parse command line
    i = 1
    while i < len(sys.argv):
        arg = sys.argv[i]
        if arg in ("-w", "-p", "-n", "-a"):
            # these options consume the following argument as their value
            i += 1
            if i >= len(sys.argv):
                usage_exit()
            value = sys.argv[i]
            if arg == "-w":
                word = value
            elif arg == "-p":
                pos_seq_file_name = value
            elif arg == "-n":
                neg_seq_file_name = value
            else:
                alphabet_file_name = value
        elif arg == "-r":
            refine = True
        else:
            # "-h" and any unknown option both print the usage and exit
            usage_exit()
        i += 1

    # check that required arguments given; the word is required too, and
    # without this check a missing -w would surface later as a NameError
    if word is None or pos_seq_file_name is None or neg_seq_file_name is None:
        usage_exit()

    # keep track of time
    start_time = time.time()

    # read alphabet, defaulting to DNA when no definition file is given
    if alphabet_file_name is not None:
        alph = alphabet.loadFromFile(alphabet_file_name)
    else:
        alph = alphabet.dna()

    # read sequences
    print("Reading sequences...", file=sys.stderr)
    pos_seqs = get_strings_from_seqs(
        sequence.readFASTA(pos_seq_file_name, alph))
    neg_seqs = get_strings_from_seqs(
        sequence.readFASTA(neg_seq_file_name, alph))

    if refine:
        # only the refined word is used; its log p-value is recomputed below
        best_word, _ = refine_consensus(
            word, pos_seqs, neg_seqs, alph, given_only)
    else:
        best_word = word

    print("Computing Hamming alignment...", file=sys.stderr)
    (dist, log_pvalue, p, P, n, N, aln) = get_best_hamming_alignment(
        best_word, pos_seqs, neg_seqs, alph, given_only)
    pv_string = sprint_logx(log_pvalue, 1, _pv_format)
    nsites = len(aln)
    print("[", p, P, n, N, dist, "]", file=sys.stderr)
    print(
        "Best ZOOPs alignment has %d sites / %d at distance %d with p-value %s"
        % (nsites, P, dist, pv_string),
        file=sys.stderr)
    print_meme_header(alph)
    print_meme_motif(best_word, nsites, pv_string, aln, alph)

    # print elapsed time
    end_time = time.time()
    elapsed = end_time - start_time
    print("elapsed time: %.2f seconds" % elapsed, file=sys.stderr)
    print("#elapsed time: %.2f seconds" % elapsed, file=sys.stdout)