Esempio n. 1
0
def posterior_decoding():
    """Posterior-decode a hard-coded genome with a hard-coded two-state HMM.

    The HMM is read from ``HMMmethanococcus.txt`` and the sequence from
    ``bacterial.genome.fasta``.  The HMM file is parsed line by line, every
    line being whitespace-separated:

    1. state names
    2. initial probabilities
    3. one transition-matrix row per state
    4. emitted symbols
    5. one emission-matrix row per state

    ``run_forward`` and ``run_backward`` (defined elsewhere) are handed
    pre-allocated matrices; their return values are ignored, so they are
    presumably expected to fill those matrices in place -- confirm against
    their definitions.

    Each position is labelled "0" when the first state's score is strictly
    greater than the second state's, otherwise "1".  The function prints the
    full label string and then one line per maximal run of equal labels:
    0-based start, exclusive stop, and "1" (label "0") or "2" (label "1").
    Finally it prints how many runs of state "2" were found.

    Returns:
        None.  All results are written to standard output.

    Raises:
        IOError: if either file cannot be opened.
    """
    hmm_file = "HMMmethanococcus.txt"
    input_file = "bacterial.genome.fasta"

    # open() raises IOError on failure (it never returns None), so opening
    # the sequence file is itself the readability check.  The `with` closes
    # the handle again; the sequence is read later by get_fasta_dict().
    with open(input_file):
        pass

    with open(hmm_file) as f_hmm_file:
        # read the state names
        states = f_hmm_file.readline().split()
        num_states = len(states)

        # read the initial probabilities
        initial_probs = [float(tok) for tok in f_hmm_file.readline().split()]

        # read the transition matrix, one row per state
        transitions = [
            [float(tok) for tok in f_hmm_file.readline().split()]
            for _ in range(num_states)
        ]

        # read the emitted symbols
        emitted_symbols = f_hmm_file.readline().split()

        # read the emission probability matrix, one row per state
        emit_probs = [
            [float(tok) for tok in f_hmm_file.readline().split()]
            for _ in range(num_states)
        ]

    seq_dict = get_fasta_dict(input_file)
    emit_str = list(seq_dict.values())[0]  # there's only 1
    seq_len = len(emit_str)
    # The double space reproduces the original two-item print statement.
    print("Done reading sequence of length  " + str(seq_len))

    print("\n.... Done reading sequence of length " + str(seq_len))

    # Pre-allocate the forward and backward matrices (seq_len x num_states)
    forward = [[float(0) for _ in range(num_states)] for _ in range(seq_len)]
    backward = [[float(0) for _ in range(num_states)] for _ in range(seq_len)]

    # Run the forward algorithm
    run_forward(states, initial_probs, transitions, emitted_symbols,
                emit_probs, emit_str, forward)

    # Run the backward algorithm
    run_backward(states, initial_probs, transitions, emitted_symbols,
                 emit_probs, emit_str, backward)

    # Calculate the posterior scores.  They are not normalized (not divided
    # by P(X)) because they are only used to compare posterior[i][0] against
    # posterior[i][1].
    posterior = [
        [forward[i][k] + backward[i][k] for k in range(num_states)]
        for i in range(seq_len)
    ]

    # Decode: one label character per position, joined once at the end.
    best_path = "".join("0" if row[0] > row[1] else "1" for row in posterior)

    print("The best path is " + best_path)
    print("")
    print("Start    Stop    State")

    # Walk the maximal runs of identical labels.  `i` is the exclusive end of
    # the current run; letting it reach len(best_path) also flushes the final
    # run, and starting at 1 means no zero-length run is ever emitted.
    number_regions = 0
    start = 0
    for i in range(1, len(best_path) + 1):
        if i == len(best_path) or best_path[i] != best_path[start]:
            state = "1" if best_path[start] == "0" else "2"
            print("%s \t%s \t%s" % (start, i, state))
            if state == "2":
                number_regions += 1
            start = i

    print("There are %s structural RNA regions" % number_regions)
def posterior_decoding(input_file, f_hmm_file):
    """Posterior-decode the sequence in ``input_file`` and print its regions.

    Args:
        input_file: passed unchanged to ``get_fasta_dict`` (defined
            elsewhere); presumably a FASTA path -- confirm against that
            helper.
        f_hmm_file: an open, readable HMM description.  It is consumed line
            by line (state names, initial probabilities, one transition row
            per state, emission symbols, one emission row per state) and is
            CLOSED by this function.

    Transition and emission probabilities are converted with ``log`` before
    being handed to ``run_forward`` / ``run_backward`` (defined elsewhere),
    so a zero probability in either matrix makes ``log`` raise.
    NOTE(review): the initial probabilities are passed through untransformed;
    confirm that is what ``run_forward`` / ``run_backward`` expect.

    For every position the state with the highest forward+backward score is
    chosen (first state wins ties).  One line is printed per maximal run of
    equal states with 1-based inclusive start/stop and a 1-based state
    number, followed by the total number of runs and the number of runs of
    the second state.

    Returns:
        None.  All results are written to standard output.
    """
    # read the state names and number
    states = f_hmm_file.readline().split()
    K = len(states)

    # read the initial probabilities
    initial_probs = [float(tok) for tok in f_hmm_file.readline().split()]

    # read the transition matrix
    transitions = [
        [float(tok) for tok in f_hmm_file.readline().split()]
        for _ in range(K)
    ]

    # read the emission symbols
    emission_symbols = f_hmm_file.readline().split()

    # read the emission probability matrix
    emit_probs = [
        [float(tok) for tok in f_hmm_file.readline().split()]
        for _ in range(K)
    ]

    f_hmm_file.close()

    seq_dict = get_fasta_dict(input_file)
    emit_str = list(seq_dict.values())[0]  # there's only 1

    # The double space reproduces the original two-item print statement.
    print("Done reading sequence of length  %s" % len(emit_str))

    # Move both matrices into log space.
    for a in range(K):
        transitions[a] = [log(p) for p in transitions[a]]
        emit_probs[a] = [log(p) for p in emit_probs[a]]

    # Run the forward algorithm
    forward = run_forward(states, initial_probs, transitions, emission_symbols,
                          emit_probs, emit_str)

    # Run the backward algorithm
    backward = run_backward(states, initial_probs, transitions,
                            emission_symbols, emit_probs, emit_str)

    # Calculate the posterior scores.  They are not normalized (not divided
    # by P(X)) because they are only compared against each other.
    posterior = [
        [forward[i][k] + backward[i][k] for k in range(K)]
        for i in range(len(emit_str))
    ]

    # Collapse the per-position best states into maximal runs.
    # Each entry is [first_index, last_index, state], all 0-based, inclusive.
    segments = []
    for i, row in enumerate(posterior):
        best_score = float('-inf')
        best_state = 0
        for k in range(K):
            if row[k] > best_score:
                best_score = row[k]
                best_state = k
        if segments and segments[-1][2] == best_state:
            segments[-1][1] = i
        else:
            segments.append([i, i, best_state])

    # Print out the decoded results
    print("start\tstop\tstate")
    state2_count = 0
    for first, last, state in segments:
        print("%s \t%s \tstate %s" % (first + 1, last + 1, state + 1))
        if state == 1:
            state2_count += 1

    print("Total number of regions reported: %s" % len(segments))
    print("Total number of state 2 regions: %s" % state2_count)
Esempio n. 3
0
def posterior_decoding(input_file, hmm_file):
    """
    Calculate the posterior decoding and return the decoded segments.

    input_file (str): path to input fasta file
    hmm_file (str): path to HMM file

    The HMM file is read line by line, every line whitespace-separated:
    state names, initial probabilities, one transition row per state,
    emission symbols, one emission row per state.

    Returns:
        A list of dictionaries, one per maximal run of positions decoded to
        the same state, in sequence order.  'start' and 'end' are 0-based
        inclusive positions; 'state' is the state's name as read from the
        first line of the HMM file.  An example output may look like:

        [
            {'start': 0, 'end': 12, 'state': 'state2'},
            {'start': 13, 'end': 20, 'state': 'state1'},
            ...
        ]

        An empty sequence yields an empty list.

    Raises:
        OSError: if either file cannot be opened.
    """

    # open() raises on failure (it never returns None), so opening the
    # sequence file is itself the readability check; the handle is closed
    # again straight away because get_fasta_dict() takes the path.
    with open(input_file):
        pass

    with open(hmm_file) as f_hmm_file:
        # read the state names and number
        states = f_hmm_file.readline().split()
        K = len(states)

        # read the initial probabilities
        initial_probs = [float(tok) for tok in f_hmm_file.readline().split()]

        # read the transition matrix
        transitions = [
            [float(tok) for tok in f_hmm_file.readline().split()]
            for _ in range(K)
        ]

        # read the emission symbols
        emission_symbols = f_hmm_file.readline().split()

        # read the emission probability matrix
        emit_probs = [
            [float(tok) for tok in f_hmm_file.readline().split()]
            for _ in range(K)
        ]

    seq_dict = get_fasta_dict(input_file)
    emit_str = list(seq_dict.values())[0]  # there's only 1

    print("Done reading sequence of length ", len(emit_str))

    # Run the forward algorithm
    forward = run_forward(states, initial_probs, transitions,
                          emission_symbols, emit_probs, emit_str)

    # Run the backward algorithm
    backward = run_backward(states, initial_probs,
                            transitions, emission_symbols, emit_probs,
                            emit_str)

    # Calculate the posterior scores.  They are not normalized (not divided
    # by P(X)) because they are only compared against each other.
    posterior = [
        [forward[i][k] + backward[i][k] for k in range(K)]
        for i in range(len(emit_str))
    ]

    # Create the list of decoded segments to return.  Runs are tracked by
    # state index (not by name) so that duplicate names cannot merge runs.
    segments = []
    previous_state = None
    for i, row in enumerate(posterior):
        best_score = float('-inf')
        best_state = 0
        for k in range(K):
            if row[k] > best_score:
                best_score = row[k]
                best_state = k
        if best_state == previous_state:
            segments[-1]['end'] = i
        else:
            segments.append(
                {'start': i, 'end': i, 'state': states[best_state]}
            )
            previous_state = best_state

    return segments
def posterior_decoding(f_in_file, f_hmm_file):
    """Decode the most likely state at every position and summarize it.

    Args:
        f_in_file: not used by this function.
        f_hmm_file: an open, readable HMM description.  It is consumed line
            by line (state names, initial probabilities, one transition row
            per state, emission symbols, one emission row per state) and is
            closed by this function.

    NOTE(review): the sequence is loaded with ``get_fasta_dict(input_file)``.
    ``input_file`` is not a parameter here, so it has to exist at module
    level when this runs -- confirm whether ``f_in_file`` was intended.

    The per-position best states (first state wins ties) are collected into
    a list that is passed to ``catch_top_ten`` and then ``catch_bottom_ten``
    (both defined elsewhere).

    Returns:
        None.
    """
    def next_fields():
        # One whitespace-separated record from the HMM file.
        return f_hmm_file.readline().split()

    def next_float_rows(count):
        # `count` consecutive records, each parsed as a row of floats.
        return [[float(tok) for tok in next_fields()] for _ in range(count)]

    # State names and their number
    states = next_fields()
    K = len(states)

    # Initial probabilities, then the K x K transition matrix
    initial_probs = [float(tok) for tok in next_fields()]
    transitions = next_float_rows(K)

    # Emission alphabet, then one emission row per state
    emission_symbols = next_fields()
    emit_probs = next_float_rows(K)

    f_hmm_file.close()

    seq_dict = get_fasta_dict(input_file)
    emit_str = list(seq_dict.values())[0]  # there's only 1

    # Joining with " " mirrors the two-item print statement this replaces.
    print(" ".join(["Done reading sequence of length ", str(len(emit_str))]))

    # Forward and backward passes
    forward = run_forward(states, initial_probs, transitions, emission_symbols,
                          emit_probs, emit_str)
    backward = run_backward(states, initial_probs, transitions,
                            emission_symbols, emit_probs, emit_str)

    # Posterior scores.  Not normalized (not divided by P(X)): they are only
    # ever compared with each other at the same position.
    posterior = [
        [forward[i][k] + backward[i][k] for k in range(K)]
        for i in range(len(emit_str))
    ]

    # Best state per position; a later state must be strictly better to win.
    trace = []
    for row in posterior:
        winner = 0
        winner_score = float("-inf")
        for k, score in enumerate(row):
            if score > winner_score:
                winner_score = score
                winner = k
        trace.append(winner)

    catch_top_ten(trace)
    catch_bottom_ten(trace)
Esempio n. 5
0
def posterior_decoding():
    """Prompt for an HMM file and a sequence file, then print decoded regions.

    The HMM file is read line by line, every line whitespace-separated:
    state names, initial probabilities, one transition row per state,
    emitted symbols, one emission row per state.  All probabilities are
    converted with ``log`` while being read, so a zero probability makes
    ``log`` raise.

    For every position the state with the highest forward+backward score is
    chosen (first state wins ties).  One line is printed per maximal run of
    equal states as ``start<TAB>stop<TAB>state N`` with 1-based inclusive
    coordinates and a 1-based state number.

    Returns:
        None.  All results are written to standard output.

    Raises:
        IOError: if either file cannot be opened.
    """
    hmm_file = raw_input("Enter the name of the HMM file:").strip()
    sys.stdout.flush()
    input_file = raw_input("Enter the name of the input file:").strip()

    # open() raises IOError on failure (it never returns None), so opening
    # the sequence file is itself the readability check.  The `with` closes
    # the handle again; the sequence is read later by get_fasta_dict().
    with open(input_file):
        pass

    with open(hmm_file) as f_hmm_file:
        # read the state names and number
        states = f_hmm_file.readline().split()
        K = len(states)

        # read the initial probabilities
        initial_probs = [
            log(float(tok)) for tok in f_hmm_file.readline().split()
        ]

        # read the transition matrix
        transitions = [
            [log(float(tok)) for tok in f_hmm_file.readline().split()]
            for _ in range(K)
        ]

        # read the emitted symbols
        emitted_symbols = f_hmm_file.readline().split()

        # read the emission probability matrix
        emit_probs = [
            [log(float(tok)) for tok in f_hmm_file.readline().split()]
            for _ in range(K)
        ]

    seq_dict = get_fasta_dict(input_file)
    emit_str = list(seq_dict.values())[0]  # there's only 1

    # The double space reproduces the original two-item print statement.
    print("Done reading sequence of length  %s" % len(emit_str))

    # Run the forward algorithm
    forward = run_forward(states, initial_probs, transitions, emitted_symbols,
                          emit_probs, emit_str)

    # Run the backward algorithm
    backward = run_backward(states, initial_probs, transitions,
                            emitted_symbols, emit_probs, emit_str)

    # Calculate the posterior scores.  They are not normalized (not divided
    # by P(X)) because they are only compared against each other.
    posterior = [
        [forward[i][k] + backward[i][k] for k in range(K)]
        for i in range(len(emit_str))
    ]

    # Best state index per position.
    decoded = []
    for row in posterior:
        best_score = float('-inf')
        best_state = 0
        for k, score in enumerate(row):
            if score > best_score:
                best_score = score
                best_state = k
        decoded.append(best_state)

    # You could print it out, but it's very long:
    # print("".join([str(x) for x in decoded]))

    print("")
    print("start\tstop\tstate")

    # Walk the maximal runs.  `i` is the exclusive end of the current run;
    # letting it reach len(decoded) flushes the final run as well, even when
    # that run is a single position long.
    run_start = 0
    for i in range(1, len(decoded) + 1):
        if i == len(decoded) or decoded[i] != decoded[run_start]:
            print(str(run_start + 1) + "\t" + str(i) + "\t" + "state "
                  + str(decoded[run_start] + 1))
            run_start = i
Esempio n. 6
0
def posterior_decoding():
    """Prompt for an HMM file and a sequence file and return the decoding.

    The HMM file is read line by line, every line whitespace-separated:
    state names, initial probabilities, one transition row per state,
    emission symbols, one emission row per state.

    For every position the state with the highest forward+backward score is
    chosen (first state wins ties).  The 1-based state numbers are
    concatenated, in sequence order, into one string which is handed to
    ``formatPath`` (defined elsewhere) together with the state names.

    Returns:
        Whatever ``formatPath(path, states)`` returns; the same value is also
        printed, prefixed with "Output is ".

    Raises:
        IOError: if either file cannot be opened.
    """
    hmm_file = raw_input("Enter the name of the HMM file:").strip()
    sys.stdout.flush()
    input_file = raw_input("Enter the name of the input file:").strip()

    # open() raises IOError on failure (it never returns None), so opening
    # the sequence file is itself the readability check.  The `with` closes
    # the handle again; the sequence is read later by get_fasta_dict().
    with open(input_file):
        pass

    with open(hmm_file) as f_hmm_file:
        # read the state names and number
        states = f_hmm_file.readline().split()
        K = len(states)

        # read the initial probabilities
        initial_probs = [float(tok) for tok in f_hmm_file.readline().split()]

        # read the transition matrix
        transitions = [
            [float(tok) for tok in f_hmm_file.readline().split()]
            for _ in range(K)
        ]

        # read the emission symbols
        emission_symbols = f_hmm_file.readline().split()

        # read the emission probability matrix
        emit_probs = [
            [float(tok) for tok in f_hmm_file.readline().split()]
            for _ in range(K)
        ]

    seq_dict = get_fasta_dict(input_file)
    emit_str = list(seq_dict.values())[0]  # there's only 1

    # The double space reproduces the original two-item print statement.
    print("Done reading sequence of length  %s" % len(emit_str))

    # Run the forward algorithm
    forward = run_forward(states, initial_probs, transitions, emission_symbols,
                          emit_probs, emit_str)

    # Run the backward algorithm
    backward = run_backward(states, initial_probs, transitions,
                            emission_symbols, emit_probs, emit_str)

    # Calculate the posterior scores.  They are not normalized (not divided
    # by P(X)) because they are only compared against each other.
    posterior = [
        [forward[i][k] + backward[i][k] for k in range(K)]
        for i in range(len(emit_str))
    ]

    # Pick the best state per position.  Positions are independent, so the
    # path is built front to back and joined once; no reversal is needed.
    labels = []
    for row in posterior:
        best_score = float('-inf')
        best_state = 0
        for k in range(K):
            if row[k] > best_score:
                best_score = row[k]
                best_state = k
        labels.append(str(best_state + 1))
    path = "".join(labels)

    # Return and print our output in the requested format (see formatPath)
    output = formatPath(path, states)
    print("Output is " + str(output))
    return output
Esempio n. 7
0
def posterior_decoding():
    """Prompt for an HMM file and a sequence file, then print decoded regions.

    The HMM file is read line by line, every line whitespace-separated:
    state names, initial probabilities, one transition row per state,
    emitted symbols, one emission row per state.

    For every position the state with the highest forward+backward score is
    chosen (first state wins ties).  One line is printed per maximal run of
    equal states: 1-based inclusive start and stop, then the state's name as
    read from the HMM file.  Two summary lines follow with the number of
    runs of the first and of the second state.  An empty sequence prints
    only the header and zero counts.

    Returns:
        None.  All results are written to standard output.

    Raises:
        IOError: if either file cannot be opened.
    """
    hmm_file = raw_input("Enter the name of the HMM file:").strip()
    sys.stdout.flush()
    input_file = raw_input("Enter the name of the input file:").strip()

    # open() raises IOError on failure (it never returns None), so opening
    # the sequence file is itself the readability check.  The `with` closes
    # the handle again; the sequence is read later by get_fasta_dict().
    with open(input_file):
        pass

    with open(hmm_file) as f_hmm_file:
        # read the state names and number
        states = f_hmm_file.readline().split()
        K = len(states)

        # read the initial probabilities
        initial_probs = [float(tok) for tok in f_hmm_file.readline().split()]

        # read the transition matrix
        transitions = [
            [float(tok) for tok in f_hmm_file.readline().split()]
            for _ in range(K)
        ]

        # read the emitted symbols
        emitted_symbols = f_hmm_file.readline().split()

        # read the emission probability matrix
        emit_probs = [
            [float(tok) for tok in f_hmm_file.readline().split()]
            for _ in range(K)
        ]

    seq_dict = get_fasta_dict(input_file)
    emit_str = list(seq_dict.values())[0]  # there's only 1

    # The double space reproduces the original two-item print statement.
    print("Done reading sequence of length  %s" % len(emit_str))

    # Run the forward algorithm
    forward = run_forward(states, initial_probs, transitions, emitted_symbols,
                          emit_probs, emit_str)

    # Run the backward algorithm
    backward = run_backward(states, initial_probs, transitions,
                            emitted_symbols, emit_probs, emit_str)

    # Calculate the posterior scores.  They are not normalized (not divided
    # by P(X)) because they are only compared against each other.
    posterior = [
        [forward[i][k] + backward[i][k] for k in range(K)]
        for i in range(len(emit_str))
    ]

    # Best state index per position.
    path = [row.index(max(row)) for row in posterior]

    # Collapse the path into (start, stop, state_index) runs with 1-based
    # inclusive coordinates.  `i` is the exclusive 0-based end of the run;
    # letting it reach len(path) flushes the final run, and an empty path
    # simply produces no runs.
    states_array = []
    state_start_index = 0
    for i in range(1, len(path) + 1):
        if i == len(path) or path[i] != path[state_start_index]:
            states_array.append(
                (state_start_index + 1, i, path[state_start_index])
            )
            state_start_index = i

    # Print out the decoded results
    print("Start Stop State")
    state1_count = 0
    state2_count = 0
    for start, stop, state in states_array:
        if state == 1:
            state2_count += 1
        if state == 0:
            state1_count += 1
        print("%s %s %s" % (start, stop, states[state]))
    print("total distinct state1 regions: " + str(state1_count))
    print("total distinct state2 regions: " + str(state2_count))