def _make_option_parser():
    """Build the command-line option parser used by main()."""
    parser = optparse.OptionParser(usage="usage: %prog [options] <file>",
                                   description=__doc__)
    parser.add_option("-r", "--reverse", action="store_true",
                      dest="reverse",
                      help="reverse the sequences, instead of shuffling")
    parser.add_option("-m", "--markov", type="int", dest="markov_length",
                      help="generate Markov sequences, with memory of length"
                      " N, instead of shuffling", default=None, metavar="N")
    parser.add_option("-s", "--seed", type="int", dest="seed",
                      help="seed for randomness [default 0]", default=0,
                      metavar="N")
    parser.add_option("-n", "--no-original", action="store_true",
                      dest="no_original",
                      help="don't output original sequences")
    parser.add_option("-v", "--verbose", action="store_true",
                      dest="verbose", help="be verbose")
    DEFAULT_WRAP = 80
    parser.add_option("-w", "--wrap", dest="wrap", type="int",
                      default=DEFAULT_WRAP,
                      help="wrap sequence to specified width"
                      " [default %s, 0 means don't wrap at all]"
                      % DEFAULT_WRAP, metavar="COLUMNS")
    parser.add_option("--copyright", action="store_true", dest="copyright",
                      help="print copyright and exit")
    return parser


def main():
    """Parse the command line and write every locus of the input file.

    Exactly one positional argument (the input file) is required.  The
    decoy strategy is chosen from the options: Markov generation when
    -m/--markov is given, reversal when -r/--reverse is given, shuffling
    otherwise.  After all loci have been passed to write_locus(), a summary
    of the six-mer sets is written to standard error.

    On a usage error (wrong number of arguments, --markov combined with
    --reverse, or a negative Markov length) the help text is printed and
    the process exits with status 1.
    """
    parser = _make_option_parser()
    (options, args) = parser.parse_args()
    # NOTE(review): options.copyright is parsed but never acted on in this
    # function -- confirm whether it is handled elsewhere.

    # Identity test rather than "!= None"; None is the "option not given"
    # default for --markov.
    use_markov = options.markov_length is not None
    wrong_arg_count = len(args) != 1
    conflicting_modes = use_markov and options.reverse
    negative_memory = use_markov and options.markov_length < 0
    if wrong_arg_count or conflicting_modes or negative_memory:
        parser.print_help()
        sys.exit(1)

    input_file = args[0]
    if use_markov:
        decoy_maker = markov_decoy_maker(options.seed, options.markov_length,
                                         input_file)
    elif options.reverse:
        decoy_maker = reverse_decoy_maker()
    else:
        decoy_maker = shuffle_decoy_maker(options.seed)

    # locus id -> (defline, hash of sequence)
    seen = {}
    # real and decoy 6-mers seen
    sixmers = (set(), set())

    for locusname, defline, sequence, filename \
            in greylag.read_fasta_files([input_file]):
        write_locus(options, decoy_maker, seen, sixmers,
                    locusname, defline, sequence)

    common_sixmers = sixmers[0] & sixmers[1]
    # sys.stderr.write emits the same line as the former
    # "print >> sys.stderr" statement without depending on Python 2 syntax.
    sys.stderr.write("six-mers: %s real %s decoy %s both"
                     % (len(sixmers[0]) - len(common_sixmers),
                        len(sixmers[1]) - len(common_sixmers),
                        len(common_sixmers))
                     + "\n")
def __init__(self, random_seed, length, original_sequence_file):
    """Seed the random module and tabulate residue transition counts.

    random_seed            -- value passed to random.seed()
    length                 -- highest context order to tabulate
    original_sequence_file -- file handed to greylag.read_fasta_files()

    self.transition becomes a list with one table per order, for orders
    0 through length.  Table `order` maps the `order` characters that
    precede a residue to a mapping of residue -> occurrence count.
    """
    random.seed(random_seed)
    self.length = length

    # for order 0 through self.length:
    #   [ length-mer -> subsequent residues -> count, ... ]
    tables = []
    for _ in range(length + 1):
        tables.append(defaultdict(lambda: defaultdict(int)))
    self.transition = tables

    records = greylag.read_fasta_files([original_sequence_file])
    for _locusname, _defline, sequence, _filename in records:
        for order, table in enumerate(tables):
            # Left-pad with '-' so that the first residues of the sequence
            # still have a context of exactly `order` characters.
            padded = '-' * order + sequence
            for start, residue in enumerate(sequence):
                context = padded[start:start + order]
                table[context][residue] += 1