def main():

    #
    # defaults
    #
    file_name = None
    alphabet_file_name = None
    seed = 1
    copies = 1

    #
    # get command line arguments
    #
    usage = """USAGE:
    %s [options]

    -f <filename>   file name (required)
    -t <tag>        added to shuffled sequence names
    -s <seed>       random seed; default: %d
    -c <n>          make <n> shuffled copies of each sequence; default: %d
    -a <filename>   alphabet file to use non-DNA alphabets
    -h              print this usage message

    Note that fasta-shuffle-letters also supports dinucleotide shuffling and is faster.
    """ % (sys.argv[0], seed, copies)

    # no arguments: print usage
    if len(sys.argv) == 1:
        print >> sys.stderr, usage; sys.exit(1)

    tag = ""

    # parse command line
    i = 1
    while i < len(sys.argv):
        arg = sys.argv[i]
        if (arg == "-f"):
            i += 1
            try: file_name = sys.argv[i]
            except: print >> sys.stderr, usage; sys.exit(1)
        elif (arg == "-t"):
            i += 1
            try: tag = sys.argv[i]
            except: print >> sys.stderr, usage; sys.exit(1)
        elif (arg == "-s"):
            i += 1
            try: seed = string.atoi(sys.argv[i])
            except: print >> sys.stderr, usage; sys.exit(1)
        elif (arg == "-c"):
            i += 1
            try: copies = string.atoi(sys.argv[i])
            except: print >> sys.stderr, usage; sys.exit(1)
        elif (arg == "-a"):
            i += 1
            try: alphabet_file_name = sys.argv[i]
            except: print >> sys.stderr, usage; sys.exit(1)
        elif (arg == "-h"):
            print >> sys.stderr, usage; sys.exit(1)
        else:
            print >> sys.stderr, "Unknown command line argument: " + arg
            sys.exit(1)
        i += 1

    # check that required arguments given
    if (file_name == None):
        print >> sys.stderr, usage; sys.exit(1)

    # get the alphabet, defaulting to DNA if it is not provided
    if alphabet_file_name != None:
        alph = alphabet.loadFromFile(alphabet_file_name)
    else:
        alph = alphabet.dna()

    random.seed(seed)

    # read sequences
    seqs = sequence.readFASTA(file_name, alph)

    for s in seqs:
        seq = s.getString()
        name = s.getName()
        for i in range(copies):
            shuffledSeq = dinuclShuffle(seq, alph)
            if (copies == 1):
                print >> sys.stdout, ">%s\n%s" % (name+tag, shuffledSeq)
            else:
                print >> sys.stdout, ">%s_%d\n%s" % (name+tag, i, shuffledSeq)
def main():

    pos_seq_file_name = None        # no positive sequence file specified
    neg_seq_file_name = None        # no negative sequence file specified
    alphabet_file_name = None
    refine = False
    given_only = False

    #
    # get command line arguments
    #
    usage = """USAGE:
    %s [options]

    -w <word>               word (required)
    -p <file_name>          positive sequences FASTA file name (required)
    -n <file_name>          negative sequences FASTA file name (required)
    -a <file_name>          alphabet definition file
    -r                      refine consensus by branching search
                            (distance 1 steps; beam size = 1).
    -h                      print this usage message

    Compute the Hamming distance from <word> to each FASTA sequence
    in the positive and negative files.  Apply Fisher's Exact test to
    each distance.
    <word> may contain ambiguous characters.

    """ % (sys.argv[0])

    # no arguments: print usage
    if len(sys.argv) == 1:
        print >> sys.stderr, usage; sys.exit(1)

    # parse command line
    i = 1
    while i < len(sys.argv):
        arg = sys.argv[i]
        if (arg == "-w"):
            i += 1
            try: word = sys.argv[i]
            except: print >> sys.stderr, usage; sys.exit(1)
        elif (arg == "-p"):
            i += 1
            try: pos_seq_file_name = sys.argv[i]
            except: print >> sys.stderr, usage; sys.exit(1)
        elif (arg == "-n"):
            i += 1
            try: neg_seq_file_name = sys.argv[i]
            except: print >> sys.stderr, usage; sys.exit(1)
        elif (arg == "-a"):
            i += 1
            try: alphabet_file_name = sys.argv[i]
            except: print >> sys.stderr, usage; sys.exit(1)
        elif (arg == "-r"):
            try: refine = True
            except: print >> sys.stderr, usage; sys.exit(1)
        elif (arg == "-h"):
            print >> sys.stderr, usage; sys.exit(1)
        else:
            print >> sys.stderr, usage; sys.exit(1)
        i += 1

    # check that required arguments given
    if (pos_seq_file_name == None or neg_seq_file_name == None):
        print >> sys.stderr, usage; sys.exit(1)

    # keep track of time
    start_time = time.time()

    # read alphabet
    alph = None
    if alphabet_file_name != None:
        alph = alphabet.loadFromFile(alphabet_file_name)
    else:
        alph = alphabet.dna()

    if (not alph.isComplementable()):
        given_only = True;
        print >> sys.stderr, "Alphabet is not complementable..."

    # read sequences
    print >> sys.stderr, "Reading sequences..."
    pos_seqs = get_strings_from_seqs(sequence.readFASTA(pos_seq_file_name, alph))
    neg_seqs = get_strings_from_seqs(sequence.readFASTA(neg_seq_file_name, alph))

    #print >> sys.stderr, "Computing Hamming enrichment..."
    #(dist, log_pvalue, p, P, n, N, aln) = get_best_hamming_alignment(word, pos_seqs, neg_seqs, alph, given_only)

    if refine:
        (best_word, best_log_pvalue) = refine_consensus(word, pos_seqs, neg_seqs, alph, given_only)
    else:
        best_word = word

    print >> sys.stderr, "Computing Hamming alignment..."
    (dist, log_pvalue, p, P, n, N, aln) = get_best_hamming_alignment(best_word, pos_seqs, neg_seqs, alph, given_only)
    pv_string = sprint_logx(log_pvalue, 1, _pv_format)
    nsites = len(aln)
    print >> sys.stderr, "[", p, P, n, N, dist, "]"
    print >> sys.stderr, "Best ZOOPs alignment has %d sites / %d at distance %d with p-value %s" % (nsites, P, dist, pv_string)
    print_meme_header(alph)
    print_meme_motif(best_word, nsites, pv_string, aln, alph)

    # print elapsed time
    end_time = time.time()
    elapsed = end_time - start_time
    print >> sys.stderr, "elapsed time: %.2f seconds" % elapsed
    print >> sys.stdout, "#elapsed time: %.2f seconds" % elapsed
Beispiel #3
0
def main():

    pos_seq_file_name = None  # no positive sequence file specified
    neg_seq_file_name = None  # no negative sequence file specified
    alphabet_file_name = None
    refine = False
    given_only = False

    #
    # get command line arguments
    #
    usage = """USAGE:
    %s [options]

    -w <word>               word (required)
    -p <file_name>          positive sequences FASTA file name (required)
    -n <file_name>          negative sequences FASTA file name (required)
    -a <file_name>          alphabet definition file
    -r                      refine consensus by branching search
                            (distance 1 steps; beam size = 1).
    -h                      print this usage message

    Compute the Hamming distance from <word> to each FASTA sequence
    in the positive and negative files.  Apply Fisher's Exact test to
    each distance.
    <word> may contain ambiguous characters.

    """ % (sys.argv[0])

    # no arguments: print usage
    if len(sys.argv) == 1:
        print >> sys.stderr, usage
        sys.exit(1)

    # parse command line
    i = 1
    while i < len(sys.argv):
        arg = sys.argv[i]
        if (arg == "-w"):
            i += 1
            try:
                word = sys.argv[i]
            except:
                print >> sys.stderr, usage
                sys.exit(1)
        elif (arg == "-p"):
            i += 1
            try:
                pos_seq_file_name = sys.argv[i]
            except:
                print >> sys.stderr, usage
                sys.exit(1)
        elif (arg == "-n"):
            i += 1
            try:
                neg_seq_file_name = sys.argv[i]
            except:
                print >> sys.stderr, usage
                sys.exit(1)
        elif (arg == "-a"):
            i += 1
            try:
                alphabet_file_name = sys.argv[i]
            except:
                print >> sys.stderr, usage
                sys.exit(1)
        elif (arg == "-r"):
            try:
                refine = True
            except:
                print >> sys.stderr, usage
                sys.exit(1)
        elif (arg == "-h"):
            print >> sys.stderr, usage
            sys.exit(1)
        else:
            print >> sys.stderr, usage
            sys.exit(1)
        i += 1

    # check that required arguments given
    if (pos_seq_file_name == None or neg_seq_file_name == None):
        print >> sys.stderr, usage
        sys.exit(1)

    # keep track of time
    start_time = time.time()

    # read alphabet
    alph = None
    if alphabet_file_name != None:
        alph = alphabet.loadFromFile(alphabet_file_name)
    else:
        alph = alphabet.dna()

    if (not alph.isComplementable()):
        given_only = True
        print >> sys.stderr, "Alphabet is not complementable..."

    # read sequences
    print >> sys.stderr, "Reading sequences..."
    pos_seqs = get_strings_from_seqs(
        sequence.readFASTA(pos_seq_file_name, alph))
    neg_seqs = get_strings_from_seqs(
        sequence.readFASTA(neg_seq_file_name, alph))

    #print >> sys.stderr, "Computing Hamming enrichment..."
    #(dist, log_pvalue, p, P, n, N, aln) = get_best_hamming_alignment(word, pos_seqs, neg_seqs, alph, given_only)

    if refine:
        (best_word,
         best_log_pvalue) = refine_consensus(word, pos_seqs, neg_seqs, alph,
                                             given_only)
    else:
        best_word = word

    print >> sys.stderr, "Computing Hamming alignment..."
    (dist, log_pvalue, p, P, n, N,
     aln) = get_best_hamming_alignment(best_word, pos_seqs, neg_seqs, alph,
                                       given_only)
    pv_string = sprint_logx(log_pvalue, 1, _pv_format)
    nsites = len(aln)
    print >> sys.stderr, "[", p, P, n, N, dist, "]"
    print >> sys.stderr, "Best ZOOPs alignment has %d sites / %d at distance %d with p-value %s" % (
        nsites, P, dist, pv_string)
    print_meme_header(alph)
    print_meme_motif(best_word, nsites, pv_string, aln, alph)

    # print elapsed time
    end_time = time.time()
    elapsed = end_time - start_time
    print >> sys.stderr, "elapsed time: %.2f seconds" % elapsed
    print >> sys.stdout, "#elapsed time: %.2f seconds" % elapsed