def posterior_decoding():
    """Posterior-decode the hard-coded genome and print the decoded regions.

    The HMM is read from ``HMMmethanococcus.txt`` and the sequence from
    ``bacterial.genome.fasta``. The HMM file is consumed line by line as:
    state names, initial probabilities, one transition-matrix row per state,
    emitted symbols, one emission-matrix row per state.

    ``run_forward`` and ``run_backward`` fill pre-allocated
    ``len(sequence) x len(states)`` matrices in place. Their entries are
    added to score each state (this presumes both work in log space --
    TODO confirm against their implementations).

    Each position is assigned to the first state when its score is strictly
    larger than the second state's, otherwise to the second state. Maximal
    runs are printed with a 0-based start and an exclusive stop; runs of the
    first state are labelled "1" and runs of the second state "2". Only the
    first two states are compared, so the HMM must define at least two.

    Exits through ``sys.exit`` when either file cannot be opened.
    """
    hmm_file = "HMMmethanococcus.txt"
    input_file = "bacterial.genome.fasta"

    # open() raises on failure instead of returning None, so the failure has
    # to be caught. The sequence file is only probed for readability here and
    # the handle is released immediately; get_fasta_dict() reads it later.
    try:
        open(input_file).close()
    except IOError:
        sys.exit("Can't open file: " + input_file)
    try:
        f_hmm_file = open(hmm_file)
    except IOError:
        sys.exit("Can't open HMM file: " + hmm_file)

    with f_hmm_file:
        # state names
        states = f_hmm_file.readline().split()
        num_states = len(states)
        # initial probabilities
        initial_probs = [float(p) for p in f_hmm_file.readline().split()]
        # transition matrix, one row per state
        transitions = [
            [float(p) for p in f_hmm_file.readline().split()]
            for _ in range(num_states)
        ]
        # emitted symbols
        emitted_symbols = f_hmm_file.readline().split()
        # emission matrix, one row per state
        emit_probs = [
            [float(p) for p in f_hmm_file.readline().split()]
            for _ in range(num_states)
        ]

    seq_dict = get_fasta_dict(input_file)
    emit_str = list(seq_dict.values())[0]  # there's only 1
    seq_len = len(emit_str)
    # Single-argument print() calls behave the same under Python 2 and 3;
    # the spacing reproduces what the original print statements emitted.
    print("%s %s" % ("Done reading sequence of length ", seq_len))
    print("\n.... Done reading sequence of length " + str(seq_len))

    # Matrices that run_forward / run_backward fill in place.
    forward = [[0.0] * num_states for _ in range(seq_len)]
    backward = [[0.0] * num_states for _ in range(seq_len)]

    run_forward(states, initial_probs, transitions, emitted_symbols,
                emit_probs, emit_str, forward)
    run_backward(states, initial_probs, transitions, emitted_symbols,
                 emit_probs, emit_str, backward)

    # Not normalised by P(X): the scores are only compared with each other
    # at the same position.
    posterior = [
        [forward[i][k] + backward[i][k] for k in range(num_states)]
        for i in range(seq_len)
    ]

    best_path = "".join(
        "0" if scores[0] > scores[1] else "1" for scores in posterior
    )
    print("The best path is " + best_path)
    print("")
    print("Start Stop State")

    # Walk the path once and close a region whenever the state changes or
    # the sequence ends, so the final region is reported as well.
    number_regions = 0
    start = 0
    for stop in range(1, len(best_path) + 1):
        if stop < len(best_path) and best_path[stop] == best_path[start]:
            continue
        label = "1" if best_path[start] == "0" else "2"
        print("%s \t%s \t%s" % (start, stop, label))
        if label == "2":
            number_regions += 1
        start = stop

    print("There are %s structural RNA regions" % number_regions)
def posterior_decoding(input_file, f_hmm_file):
    """Posterior-decode the sequence in *input_file* and print its regions.

    Args:
        input_file: value passed to ``get_fasta_dict`` (presumably the path
            of a FASTA file); the first sequence it returns is decoded.
        f_hmm_file: open HMM file, read line by line as: state names,
            initial probabilities, one transition row per state, emission
            symbols, one emission row per state. It is always closed by
            this function, even when parsing fails.

    Transition and emission probabilities are passed through ``log`` before
    the forward/backward runs; the initial probabilities are handed over
    unchanged. NOTE(review): if ``log`` is ``math.log``, a zero probability
    in either matrix raises ``ValueError`` -- confirm that is acceptable.

    Every position is assigned the state with the largest
    ``forward + backward`` score (lowest index wins ties). Maximal runs are
    printed as 1-based, inclusive ``start``/``stop`` positions, followed by
    the total number of regions and the number of regions in the second
    state.
    """
    try:
        # state names and number
        states = f_hmm_file.readline().split()
        K = len(states)
        # initial probabilities
        initial_probs = [float(p) for p in f_hmm_file.readline().split()]
        # transition matrix
        transitions = [
            [float(p) for p in f_hmm_file.readline().split()]
            for _ in range(K)
        ]
        # emission symbols
        emission_symbols = f_hmm_file.readline().split()
        # emission probability matrix
        emit_probs = [
            [float(p) for p in f_hmm_file.readline().split()]
            for _ in range(K)
        ]
    finally:
        f_hmm_file.close()

    seq_dict = get_fasta_dict(input_file)
    emit_str = list(seq_dict.values())[0]  # there's only 1
    # Single-argument print() calls behave the same under Python 2 and 3;
    # the spacing reproduces what the original print statements emitted.
    print("%s %s" % ("Done reading sequence of length ", len(emit_str)))

    # Move the model into log space.
    transitions = [[log(p) for p in row] for row in transitions]
    emit_probs = [[log(p) for p in row] for row in emit_probs]

    forward = run_forward(states, initial_probs, transitions,
                          emission_symbols, emit_probs, emit_str)
    backward = run_backward(states, initial_probs, transitions,
                            emission_symbols, emit_probs, emit_str)

    # Not normalised by P(X): the scores are only compared with each other
    # at the same position.
    posterior = [
        [forward[i][k] + backward[i][k] for k in range(K)]
        for i in range(len(emit_str))
    ]

    # Best state per position; list.index() returns the first maximum.
    best_states = [scores.index(max(scores)) for scores in posterior]

    print("start\tstop\tstate")
    total_count = 0
    state2_count = 0
    start = 0
    # Close a region whenever the state changes or the sequence ends. This
    # reports a final single-position region correctly and never emits an
    # empty region when the sequence does not start in the first state.
    for stop in range(1, len(best_states) + 1):
        if stop < len(best_states) and best_states[stop] == best_states[start]:
            continue
        state = best_states[start]
        total_count += 1
        if state == 1:
            state2_count += 1
        print("%s \t%s \tstate %s" % (start + 1, stop, state + 1))
        start = stop

    print("Total number of regions reported: %s" % total_count)
    print("Total number of state 2 regions: %s" % state2_count)
def posterior_decoding(input_file, hmm_file):
    """
    Calculate the posterior decoding and return the decoded segments.

    Args:
        input_file (str): path to input fasta file
        hmm_file (str): path to HMM file. It is read line by line as: state
            names, initial probabilities, one transition row per state,
            emission symbols, one emission row per state.

    Returns:
        A list of dictionaries, one per maximal run of positions that share
        the same best state, in sequence order. ``start`` and ``end`` are
        0-based and inclusive; ``state`` is the state's name as listed on
        the first line of the HMM file. An example output may look like:
        [
            {'start': 0, 'end': 12, 'state': 'state2'},
            {'start': 13, 'end': 20, 'state': 'state1'},
            ...
        ]
        An empty sequence yields an empty list.

    Raises:
        OSError: if either file cannot be opened.
    """
    # Fail fast on an unreadable sequence file without keeping a handle
    # open; get_fasta_dict() does the actual reading further down.
    with open(input_file):
        pass

    with open(hmm_file) as f_hmm_file:
        # state names and number
        states = f_hmm_file.readline().split()
        K = len(states)
        # initial probabilities
        initial_probs = [float(p) for p in f_hmm_file.readline().split()]
        # transition matrix
        transitions = [
            [float(p) for p in f_hmm_file.readline().split()]
            for _ in range(K)
        ]
        # emission symbols
        emission_symbols = f_hmm_file.readline().split()
        # emission probability matrix
        emit_probs = [
            [float(p) for p in f_hmm_file.readline().split()]
            for _ in range(K)
        ]

    seq_dict = get_fasta_dict(input_file)
    emit_str = list(seq_dict.values())[0]  # there's only 1
    print("%s %s" % ("Done reading sequence of length ", len(emit_str)))

    forward = run_forward(states, initial_probs, transitions,
                          emission_symbols, emit_probs, emit_str)
    backward = run_backward(states, initial_probs, transitions,
                            emission_symbols, emit_probs, emit_str)

    # Not normalised by P(X): the scores are only compared with each other
    # at the same position. Adding them presumes run_forward/run_backward
    # return log-space values -- TODO confirm.
    posterior = [
        [forward[i][k] + backward[i][k] for k in range(K)]
        for i in range(len(emit_str))
    ]

    # Best state per position; list.index() returns the first maximum, so
    # the lowest state index wins ties.
    best_states = [scores.index(max(scores)) for scores in posterior]

    # Collapse the per-position states into maximal runs.
    segments = []
    start = 0
    for stop in range(1, len(best_states) + 1):
        if stop < len(best_states) and best_states[stop] == best_states[start]:
            continue
        segments.append({
            'start': start,
            'end': stop - 1,
            'state': states[best_states[start]],
        })
        start = stop
    return segments
def posterior_decoding(f_in_file, f_hmm_file):
    """Compute the best state per position and pass the trace to the reporters.

    ``f_hmm_file`` is an open HMM file, consumed line by line (state names,
    initial probabilities, one transition row per state, emission symbols,
    one emission row per state) and then closed. ``f_in_file`` is accepted
    but not used by the body.

    The per-position best states (lowest index wins ties) are collected in a
    list that is passed to ``catch_top_ten`` and ``catch_bottom_ten``.
    Nothing is returned.
    """
    # --- HMM definition -------------------------------------------------
    states = f_hmm_file.readline().split()
    K = len(states)

    initial_probs = list(map(float, f_hmm_file.readline().split()))

    transitions = []
    for _ in range(K):
        transitions.append(list(map(float, f_hmm_file.readline().split())))

    emission_symbols = f_hmm_file.readline().split()

    emit_probs = []
    for _ in range(K):
        emit_probs.append(list(map(float, f_hmm_file.readline().split())))

    f_hmm_file.close()

    # --- sequence -------------------------------------------------------
    # NOTE(review): ``input_file`` is not a parameter of this function, so
    # it must resolve to a module-level name at call time -- confirm that
    # is intended rather than a leftover from a signature change.
    seq_dict = get_fasta_dict(input_file)
    emit_str = list(seq_dict.values())[0]  # there's only 1
    seq_len = len(emit_str)
    print("%s %s" % ("Done reading sequence of length ", seq_len))

    # --- forward / backward ---------------------------------------------
    forward = run_forward(states, initial_probs, transitions,
                          emission_symbols, emit_probs, emit_str)
    backward = run_backward(states, initial_probs, transitions,
                            emission_symbols, emit_probs, emit_str)

    # Scores are left unnormalised (no division by P(X)) because they are
    # only compared with each other at the same position.
    posterior = [
        [forward[pos][k] + backward[pos][k] for k in range(K)]
        for pos in range(seq_len)
    ]

    # --- best state per position ----------------------------------------
    trace = []
    for scores in posterior:
        best_state = 0
        best_score = float("-inf")
        for k, score in enumerate(scores):
            if score > best_score:
                best_state, best_score = k, score
        trace.append(best_state)

    catch_top_ten(trace)
    catch_bottom_ten(trace)
def posterior_decoding():
    """Prompt for an HMM file and a FASTA file, then print the decoded regions.

    The HMM file is read line by line as: state names, initial
    probabilities, one transition row per state, emitted symbols, one
    emission row per state. Every probability is passed through ``log`` as
    it is read. NOTE(review): if ``log`` is ``math.log``, a zero probability
    raises ``ValueError`` -- confirm that is acceptable.

    Every position is assigned the state with the largest
    ``forward + backward`` score (lowest index wins ties). Maximal runs are
    printed as tab-separated, 1-based, inclusive ``start``/``stop``
    positions with the 1-based state number.

    Exits through ``sys.exit`` when either file cannot be opened.
    """
    try:
        ask = raw_input  # Python 2
    except NameError:
        ask = input  # Python 3

    hmm_file = ask("Enter the name of the HMM file:").strip()
    sys.stdout.flush()
    input_file = ask("Enter the name of the input file:").strip()

    # open() raises on failure instead of returning None, so the failure has
    # to be caught. The sequence file is only probed for readability here and
    # the handle is released immediately; get_fasta_dict() reads it later.
    try:
        open(input_file).close()
    except IOError:
        sys.exit("Can't open file: " + input_file)
    try:
        f_hmm_file = open(hmm_file)
    except IOError:
        sys.exit("Can't open HMM file: " + hmm_file)

    with f_hmm_file:
        # state names and number
        states = f_hmm_file.readline().split()
        K = len(states)
        # initial probabilities
        initial_probs = [log(float(p)) for p in f_hmm_file.readline().split()]
        # transition matrix
        transitions = [
            [log(float(p)) for p in f_hmm_file.readline().split()]
            for _ in range(K)
        ]
        # emitted symbols
        emitted_symbols = f_hmm_file.readline().split()
        # emission probability matrix
        emit_probs = [
            [log(float(p)) for p in f_hmm_file.readline().split()]
            for _ in range(K)
        ]

    seq_dict = get_fasta_dict(input_file)
    emit_str = list(seq_dict.values())[0]  # there's only 1
    # Single-argument print() calls behave the same under Python 2 and 3;
    # the spacing reproduces what the original print statements emitted.
    print("%s %s" % ("Done reading sequence of length ", len(emit_str)))

    forward = run_forward(states, initial_probs, transitions,
                          emitted_symbols, emit_probs, emit_str)
    backward = run_backward(states, initial_probs, transitions,
                            emitted_symbols, emit_probs, emit_str)

    # Not normalised by P(X): the scores are only compared with each other
    # at the same position.
    posterior = [
        [forward[i][k] + backward[i][k] for k in range(K)]
        for i in range(len(emit_str))
    ]

    # Best state per position; list.index() returns the first maximum and
    # always yields a real state index, even when every score is -inf.
    best_states = [scores.index(max(scores)) for scores in posterior]

    print("")
    print("start\tstop\tstate")
    # Close a region whenever the state changes or the sequence ends, so a
    # final region that is a single position long is printed too.
    start = 0
    for stop in range(1, len(best_states) + 1):
        if stop < len(best_states) and best_states[stop] == best_states[start]:
            continue
        print(str(start + 1) + "\t" + str(stop) + "\t"
              + "state " + str(best_states[start] + 1))
        start = stop
def posterior_decoding():
    """Prompt for an HMM file and a FASTA file and return the formatted decoding.

    The HMM file is read line by line as: state names, initial
    probabilities, one transition row per state, emission symbols, one
    emission row per state.

    Every position is assigned the state with the largest
    ``forward + backward`` score (lowest index wins ties). The 1-based state
    numbers are concatenated, in sequence order, into one string that is
    passed to ``formatPath`` together with the state names.

    Returns:
        Whatever ``formatPath`` returns for the decoded path; the value is
        also printed.

    Exits through ``sys.exit`` when either file cannot be opened.
    """
    try:
        ask = raw_input  # Python 2
    except NameError:
        ask = input  # Python 3

    hmm_file = ask("Enter the name of the HMM file:").strip()
    sys.stdout.flush()
    input_file = ask("Enter the name of the input file:").strip()

    # open() raises on failure instead of returning None, so the failure has
    # to be caught. The sequence file is only probed for readability here and
    # the handle is released immediately; get_fasta_dict() reads it later.
    try:
        open(input_file).close()
    except IOError:
        sys.exit("Can't open file: " + input_file)
    try:
        f_hmm_file = open(hmm_file)
    except IOError:
        sys.exit("Can't open HMM file: " + hmm_file)

    with f_hmm_file:
        # state names and number
        states = f_hmm_file.readline().split()
        K = len(states)
        # initial probabilities
        initial_probs = [float(p) for p in f_hmm_file.readline().split()]
        # transition matrix
        transitions = [
            [float(p) for p in f_hmm_file.readline().split()]
            for _ in range(K)
        ]
        # emission symbols
        emission_symbols = f_hmm_file.readline().split()
        # emission probability matrix
        emit_probs = [
            [float(p) for p in f_hmm_file.readline().split()]
            for _ in range(K)
        ]

    seq_dict = get_fasta_dict(input_file)
    emit_str = list(seq_dict.values())[0]  # there's only 1
    # Single-argument print() calls behave the same under Python 2 and 3;
    # the spacing reproduces what the original print statement emitted.
    print("%s %s" % ("Done reading sequence of length ", len(emit_str)))

    forward = run_forward(states, initial_probs, transitions,
                          emission_symbols, emit_probs, emit_str)
    backward = run_backward(states, initial_probs, transitions,
                            emission_symbols, emit_probs, emit_str)

    # Not normalised by P(X): the scores are only compared with each other
    # at the same position. Adding them presumes run_forward/run_backward
    # return log-space values -- TODO confirm.
    posterior = [
        [forward[i][k] + backward[i][k] for k in range(K)]
        for i in range(len(emit_str))
    ]

    # Posterior decoding picks each position independently, so the path can
    # be built front to back in one pass. join() keeps this linear in the
    # sequence length, and multi-digit state numbers stay intact because
    # nothing is reversed afterwards.
    path = "".join(
        str(scores.index(max(scores)) + 1) for scores in posterior
    )

    # Convert the path to the requested output format (see formatPath).
    output = formatPath(path, states)
    print("Output is " + str(output))
    return output
def posterior_decoding():
    """Prompt for an HMM file and a FASTA file, then print the decoded regions.

    The HMM file is read line by line as: state names, initial
    probabilities, one transition row per state, emitted symbols, one
    emission row per state.

    Every position is assigned the state with the largest
    ``forward + backward`` score (lowest index wins ties). Maximal runs are
    printed as 1-based, inclusive ``start``/``stop`` positions with the
    state's name, followed by the number of regions in the first and in the
    second state. An empty sequence prints the header and two zero counts.

    Exits through ``sys.exit`` when either file cannot be opened.
    """
    try:
        ask = raw_input  # Python 2
    except NameError:
        ask = input  # Python 3

    hmm_file = ask("Enter the name of the HMM file:").strip()
    sys.stdout.flush()
    input_file = ask("Enter the name of the input file:").strip()

    # open() raises on failure instead of returning None, so the failure has
    # to be caught. The sequence file is only probed for readability here and
    # the handle is released immediately; get_fasta_dict() reads it later.
    try:
        open(input_file).close()
    except IOError:
        sys.exit("Can't open file: " + input_file)
    try:
        f_hmm_file = open(hmm_file)
    except IOError:
        sys.exit("Can't open HMM file: " + hmm_file)

    with f_hmm_file:
        # state names and number
        states = f_hmm_file.readline().split()
        K = len(states)
        # initial probabilities
        initial_probs = [float(p) for p in f_hmm_file.readline().split()]
        # transition matrix
        transitions = [
            [float(p) for p in f_hmm_file.readline().split()]
            for _ in range(K)
        ]
        # emitted symbols
        emitted_symbols = f_hmm_file.readline().split()
        # emission probability matrix
        emit_probs = [
            [float(p) for p in f_hmm_file.readline().split()]
            for _ in range(K)
        ]

    seq_dict = get_fasta_dict(input_file)
    emit_str = list(seq_dict.values())[0]  # there's only 1
    # Single-argument print() calls behave the same under Python 2 and 3;
    # the spacing reproduces what the original print statements emitted.
    print("%s %s" % ("Done reading sequence of length ", len(emit_str)))

    forward = run_forward(states, initial_probs, transitions,
                          emitted_symbols, emit_probs, emit_str)
    backward = run_backward(states, initial_probs, transitions,
                            emitted_symbols, emit_probs, emit_str)

    # Not normalised by P(X): the scores are only compared with each other
    # at the same position. Adding them presumes run_forward/run_backward
    # return log-space values -- TODO confirm.
    posterior = [
        [forward[i][k] + backward[i][k] for k in range(K)]
        for i in range(len(emit_str))
    ]

    # Best state per position; list.index() returns the first maximum.
    path = [scores.index(max(scores)) for scores in posterior]

    # Collapse the path into (start, stop, state) runs. Closing a run on
    # "state changes or sequence ends" needs no look at path[0] and no loop
    # variable after the loop, so an empty path simply yields no runs.
    states_array = []
    run_start = 0
    for stop in range(1, len(path) + 1):
        if stop < len(path) and path[stop] == path[run_start]:
            continue
        states_array.append((run_start + 1, stop, path[run_start]))
        run_start = stop

    print("Start Stop State")
    state1_count = 0
    state2_count = 0
    for start, stop, state in states_array:
        if state == 0:
            state1_count += 1
        elif state == 1:
            state2_count += 1
        print("%s %s %s" % (start, stop, states[state]))

    print("total distinct state1 regions: " + str(state1_count))
    print("total distinct state2 regions: " + str(state2_count))