def align_mates2(contigs, mates):
    """Record, per mate pair, the contigs that exactly one of its ends matches.

    Every contig is aligned against both reads of every mate pair.  A read
    "matches" when the first element of the ``local_alignment`` result is at
    least ``len(mate1) - 2 * ceil(0.02 * len(mate1))``.  A contig matched by
    both reads of the same pair is ignored.

    Args:
        contigs: sequence of contig sequences.
        mates: sequence of mate-pair records; element 0 is used as the left
            read and element 2 as the right read (element 1 is not read
            here).

    Returns:
        dict mapping every mate index to ``[left_hits, right_hits]``.  Each
        hit is ``[contig_index, alignment_result[1][0]]``.
        NOTE(review): ``alignment_result[1][0]`` is presumably the position
        of the alignment in the contig -- confirm against local_alignment.
    """
    mate_matches = {}
    for i, mate in enumerate(mates):
        mate1 = mate[0]
        mate2 = mate[2]
        left_hits = []
        right_hits = []
        mate_matches[i] = [left_hits, right_hits]
        # The threshold is derived from the left read only and is applied to
        # both reads of the pair.
        threshold_score = len(mate1) - 2 * int(math.ceil(len(mate1) * 0.02))
        for j, contig in enumerate(contigs):
            alignment1 = local_alignment(contig, mate1, 1, -1, -99999999, True)
            alignment2 = local_alignment(contig, mate2, 1, -1, -99999999, True)
            match1 = alignment1[0] >= threshold_score
            match2 = alignment2[0] >= threshold_score
            # Only keep contigs hit by exactly one end of the pair.
            if match1 and not match2:
                left_hits.append([j, alignment1[1][0]])
                # Single-argument print(...) emits the same text on
                # Python 2 and Python 3.
                print("mate " + str(i) + " left matched in contig " + str(j))
            elif match2 and not match1:
                right_hits.append([j, alignment2[1][0]])
                print("mate " + str(i) + " right matched in contig " + str(j))
    return mate_matches
def align_mates2(contigs, mates):
    """Record, per mate pair, the contigs that exactly one of its ends matches.

    Every contig is aligned against both reads of every mate pair.  A read
    "matches" when the first element of the ``local_alignment`` result is at
    least ``len(mate1) - 2 * ceil(0.02 * len(mate1))``.  A contig matched by
    both reads of the same pair is ignored.

    Args:
        contigs: sequence of contig sequences.
        mates: sequence of mate-pair records; element 0 is used as the left
            read and element 2 as the right read (element 1 is not read
            here).

    Returns:
        dict mapping every mate index to ``[left_hits, right_hits]``.  Each
        hit is ``[contig_index, alignment_result[1][0]]``.
        NOTE(review): ``alignment_result[1][0]`` is presumably the position
        of the alignment in the contig -- confirm against local_alignment.
    """
    mate_matches = {}
    for i, mate in enumerate(mates):
        mate1 = mate[0]
        mate2 = mate[2]
        left_hits = []
        right_hits = []
        mate_matches[i] = [left_hits, right_hits]
        # The threshold is derived from the left read only and is applied to
        # both reads of the pair.
        threshold_score = len(mate1) - 2 * int(math.ceil(len(mate1) * 0.02))
        for j, contig in enumerate(contigs):
            alignment1 = local_alignment(contig, mate1, 1, -1, -99999999, True)
            alignment2 = local_alignment(contig, mate2, 1, -1, -99999999, True)
            match1 = alignment1[0] >= threshold_score
            match2 = alignment2[0] >= threshold_score
            # Only keep contigs hit by exactly one end of the pair.
            if match1 and not match2:
                left_hits.append([j, alignment1[1][0]])
                # Single-argument print(...) emits the same text on
                # Python 2 and Python 3.
                print("mate " + str(i) + " left matched in contig " + str(j))
            elif match2 and not match1:
                right_hits.append([j, alignment2[1][0]])
                print("mate " + str(i) + " right matched in contig " + str(j))
    return mate_matches
def align_mates(contigs, mates):
    """Record which mate reads match each contig.

    Every contig is aligned against both reads of every mate pair.  A read
    matches when its ``local_alignment`` result is at least
    ``len(mate1) - ceil(0.02 * len(mate1))`` (i.e. allowing for 2% errors).
    When both reads of a pair match the same contig the pair is discarded
    for that contig.

    Args:
        contigs: sequence of contig sequences.
        mates: sequence of mate-pair records; element 0 is used as the first
            read and element 2 as the second read.

    Returns:
        dict mapping a contig index to a list of ``[mate_index, end]``
        entries, where ``end`` is 1 when only the first read matched and 2
        when only the second read matched.  Contigs without any match have
        no key.
    """
    contig_matches = {}
    for i, contig in enumerate(contigs):
        for j, mate in enumerate(mates):
            mate1 = mate[0]
            mate2 = mate[2]
            alignment_score1 = local_alignment(contig, mate1, 1, -1, -99999999, True)
            alignment_score2 = local_alignment(contig, mate2, 1, -1, -99999999, True)
            # Allow for 2% errors; the threshold is derived from the first
            # read only and is applied to both reads.
            threshold_score = len(mate1) - int(math.ceil(len(mate1) * 0.02))
            # Diagnostic output; single-argument print(...) emits the same
            # text on Python 2 and Python 3.
            print(
                "contig: " + str(i) + " mate1: " + str(j) + " al1: "
                + str(alignment_score1) + " thresh: " + str(threshold_score)
            )
            print("al2: " + str(alignment_score2) + " thresh: " + str(threshold_score))
            # NOTE(review): the whole local_alignment result is compared to
            # the threshold here -- this assumes the call returns a plain
            # number; confirm against local_alignment.
            match1 = alignment_score1 >= threshold_score
            match2 = alignment_score2 >= threshold_score
            # If both ends match the contig, discard the mate.
            if match1 and not match2:
                contig_matches.setdefault(i, []).append([j, 1])
            elif match2 and not match1:
                contig_matches.setdefault(i, []).append([j, 2])
    return contig_matches
def main():
    """Interactively align two sequences and print the result.

    Prompts, in order, for: a substitution-matrix file, then for each of the
    two sequences whether it is a nucleotide sequence and its filename, then
    the alignment type (1 global, 2 semiglobal, anything else local) and a
    gap score.  The chosen alignment is run and its ``out`` and ``dirct``
    tables and the two aligned arrays are printed.
    """

    def load_sequence(filename_prompt):
        # Ask the nucleotide question first, then the filename, exactly in
        # that order, and pick the reader accordingly.
        answer = input("Is this a nucleotide sequence? Y/N: ")
        path = input(filename_prompt)
        if answer in ("Y", "y"):
            return read_file_as_protein(path)
        return read_file(path)

    print("Hello!")
    print("First, let's pick a substitution matrix.")
    matrix_path = input("please enter a filename: ")
    smat = generate_sub_matrix(matrix_path)
    print(smat.sub_scores)

    cseq = load_sequence("Please enter filename of first sequence: ")
    rseq = load_sequence("Please enter filename of second sequence: ")

    for menu_line in (
        "If this is a global alignment, enter 1",
        "If this is a semiglobal alignment, enter 2",
        "If this is a local alignment, enter 3",
    ):
        print(menu_line)
    choice = input("Please enter 1, 2, or 3: ")
    gap_score = int(input("Please enter a gap score: "))

    if choice in ("1", "2"):
        # The fourth argument is False for global and True for semiglobal.
        alignment = global_alignment(smat, cseq, rseq, choice == "2", gap_score)
    else:
        alignment = local_alignment(smat, cseq, rseq, gap_score)

    print_formatted(alignment.out, 4)
    print_formatted(alignment.dirct, 3)
    print_seq(alignment.arr1, alignment.arr2)
def contig_dict(reads, contigs, error_rate):
    """Map each read to the contigs (and positions) it aligns to.

    Each read is locally aligned against every contig.  A read is recorded
    for a contig when the alignment score is at least ``len(read) - 5``.

    Args:
        reads: iterable of read sequences (used as dict keys).
        contigs: sequence of contig strings.
        error_rate: currently unused -- the threshold is the fixed
            ``len(read) - 5`` rather than a value derived from this rate.

    Returns:
        dict mapping a read to a list of ``[contig_index, position]`` pairs,
        where ``position`` is ``contig.find(alignment[1])`` (``-1`` when that
        string does not occur in the contig).  Reads without any hit have no
        key.
    """
    read_dict = {}
    for read in reads:
        # Loop-invariant for a given read.
        threshold_score = len(read) - 5
        # enumerate gives the true index even when the same contig string
        # occurs more than once (list.index would always return the first
        # occurrence) and avoids a linear search per hit.
        for contig_index, contig in enumerate(contigs):
            score = local_alignment(read, contig, 1, -1, -99, True)
            if score >= threshold_score:
                # The full alignment is only needed for accepted hits.
                # NOTE(review): assumes local_alignment has no side effects,
                # so skipping this call for rejected pairs is safe.
                alignment = local_alignment(read, contig, 1, -1, -99, False)
                # str.find replaces the Python 2-only string.find function.
                alignment_pos = contig.find(alignment[1])
                read_dict.setdefault(read, []).append([contig_index, alignment_pos])
    return read_dict
def contig_dict(reads, contigs, error_rate):
    """Map each read to the contigs (and positions) it aligns to.

    Each read is locally aligned against every contig.  A read is recorded
    for a contig when the alignment score is at least ``len(read) - 5``.

    Args:
        reads: iterable of read sequences (used as dict keys).
        contigs: sequence of contig strings.
        error_rate: currently unused -- the threshold is the fixed
            ``len(read) - 5`` rather than a value derived from this rate.

    Returns:
        dict mapping a read to a list of ``[contig_index, position]`` pairs,
        where ``position`` is ``contig.find(alignment[1])`` (``-1`` when that
        string does not occur in the contig).  Reads without any hit have no
        key.
    """
    read_dict = {}
    for read in reads:
        # Loop-invariant for a given read.
        threshold_score = len(read) - 5
        # enumerate gives the true index even when the same contig string
        # occurs more than once (list.index would always return the first
        # occurrence) and avoids a linear search per hit.
        for contig_index, contig in enumerate(contigs):
            score = local_alignment(read, contig, 1, -1, -99, True)
            if score >= threshold_score:
                # The full alignment is only needed for accepted hits.
                # NOTE(review): assumes local_alignment has no side effects,
                # so skipping this call for rejected pairs is safe.
                alignment = local_alignment(read, contig, 1, -1, -99, False)
                # str.find replaces the Python 2-only string.find function.
                alignment_pos = contig.find(alignment[1])
                read_dict.setdefault(read, []).append([contig_index, alignment_pos])
    return read_dict
def tour_bus(G, score, alignment_params):
    """Look for one bubble in ``G`` and merge its two branches in place.

    Starting from each node of ``G`` in turn, nodes are expanded in order of
    increasing accumulated distance, where stepping onto a node ``n`` costs
    ``len(n) / G.node[n]['num']``.  When an expansion reaches a node that was
    already visited, two of its visited predecessors are traced back to the
    first node common to both traces, which yields the two branches of a
    candidate bubble.  The branch sequences (built by chaining ``overlap``
    over the node identifiers) are locally aligned; if the result exceeds
    ``score``, nodes of one branch are folded into matching nodes of the
    other: the ``'num'`` attributes are summed, incoming and outgoing edges
    are copied, and the absorbed node is removed.

    Args:
        G: directed graph providing ``nodes``, ``neighbors``,
            ``predecessors``, ``add_edge``, ``remove_node`` and the ``node``
            attribute mapping.  NOTE(review): looks like a networkx DiGraph
            whose node ids are sequence strings and whose ``'num'`` attribute
            is coverage -- confirm.
        score: a merge happens only when a local alignment result is strictly
            greater than this value.
        alignment_params: indexable; elements 0, 1 and 2 are forwarded as the
            third to fifth positional arguments of ``local_alignment``
            (presumably match, mismatch and gap scores -- confirm).

    Returns:
        True immediately after the first merge, False when no merge was made
        (including the "should never get here!" case).
    """
    # NOTE: this block uses the Python 2 print statement.
    for start_node in G.nodes():
        # Re-check membership on every iteration.
        if start_node in G.nodes():
            #bfs exploration of the graph. Visit nodes in increasing distance from the origin (the first node in the graph for now)
            # Need to make sure we cover all possible nodes.
            start = start_node
            previous = start
            # Accumulated distance of every discovered, not yet expanded node.
            distances = {}
            distances[start] = 0.0
            visited = [start]
            #have we found a bubble?
            have_merged=False
            while not have_merged:
                #unveil neighbors of previous (self-loops are ignored)
                new_nodes = [a for a in G.neighbors(previous) if a != previous]
                for i in new_nodes:
                    if i in visited:
                        #found a node that we've been to before!
                        # backtrack and find closest common ancestor. both predecessors of i are expected to be in the visited set.
                        to_traceback = [a for a in G.predecessors(i) if a in visited]
                        # Fewer than two visited predecessors: not a bubble.
                        # This abandons the remaining neighbors of `previous` as well.
                        if len(to_traceback) <2:
                            break
                        #traceback path 1. shouldn't encounter a node that has more than one predecessor in the visited set
                        traceback_path_1 = []
                        traceback_pred_1 = [to_traceback[0]]
                        while traceback_pred_1 != []:
                            traceback_path_1.append(traceback_pred_1[0])
                            # Only the first qualifying predecessor is followed on the next step.
                            traceback_pred_1 = [a for a in G.predecessors(traceback_pred_1[0]) if (a in visited) and (a not in traceback_path_1)]
                        #traceback path 2.
                        traceback_path_2 = []
                        traceback_pred_2 = [to_traceback[1]]
                        while traceback_pred_2 != []:
                            traceback_path_2.append(traceback_pred_2[0])
                            traceback_pred_2 = [a for a in G.predecessors(traceback_pred_2[0]) if (a in visited) and (a not in traceback_path_2)]
                        #find first element that occurs in both
                        # NOTE: raises IndexError if the two traces share no node.
                        overlap_first = [j for j in traceback_path_1 if j in traceback_path_2][0]
                        #find first overlap between lists
                        overlap_pos1 = traceback_path_1.index(overlap_first)
                        overlap_pos2 = traceback_path_2.index(overlap_first)
                        #get rid of path past the overlap (the common node itself is dropped too)
                        traceback_path_1 = traceback_path_1[:overlap_pos1]
                        traceback_path_2 = traceback_path_2[:overlap_pos2]
                        # Reverse so the paths run from the ancestor side towards i.
                        forward_path_1 = traceback_path_1[::-1]
                        forward_path_2 = traceback_path_2[::-1]
                        #extract sequences from corresponding paths
                        sequence_1 = ''
                        for j in forward_path_1:
                            if sequence_1 != '':
                                sequence_1 = overlap(sequence_1,j)
                            else:
                                sequence_1 = j
                        sequence_2 = ''
                        for j in forward_path_2:
                            if sequence_2 != '':
                                sequence_2 = overlap(sequence_2, j)
                            else:
                                sequence_2 = j
                        #sequence to continue with reached end node first.
                        #that's the path that doesn't have 'previous' in it
                        if previous in forward_path_1:
                            seq_to_merge = 2
                        elif previous in forward_path_2:
                            seq_to_merge =1
                        else:
                            print "should never get here!"
                            return False
                        #align the two sequences. use local alignment and a strict score
                        #could later convert this to a relative percentage match, something
                        # like the 80% they use in velvet.
                        merge = False
                        alignment_score = local_alignment(sequence_1,sequence_2, alignment_params[0], alignment_params[1],alignment_params[2],True)
                        if alignment_score > score:
                            merge=True
                        #Should the two sequences be merged?
                        if merge:
                            have_merged=True
                            #do merging!
                            if seq_to_merge == 1:
                                #path 1 is being represented in the merged graph. start at the beginning, look at the corresponding node in the other path
                                # align and decide to merge. then copy coverage information (additive) and edges
                                j = 0
                                while j < len(forward_path_1):
                                    k=j
                                    while k < len(forward_path_2):
                                        if (local_alignment(forward_path_1[j],forward_path_2[k],alignment_params[0], alignment_params[1],alignment_params[2],True) > score):
                                            #merge these and copy info
                                            G.node[forward_path_1[j]]['num'] += G.node[forward_path_2[k]]['num']
                                            #copy (incoming edges to forward_path_2[k]) to forward_path_1[j]
                                            for l in G.predecessors(forward_path_2[k]):
                                                G.add_edge(l,forward_path_1[j])
                                            #copy (outgoing edges from forward_path_2[k]) to forward_path_1[j]
                                            for l in G.neighbors(forward_path_2[k]):
                                                G.add_edge(forward_path_1[j],l)
                                            #delete forward_path_2[k]
                                            G.remove_node(forward_path_2[k])
                                            # Continue the outer scan after the matched index and leave the inner loop.
                                            j=k
                                            k = len(forward_path_2)
                                        else:
                                            k+=1
                                    j+=1
                            if seq_to_merge == 2:
                                #path 2 is being represented in the merged graph. start at the beginning, look at the corresponding node in the other path
                                # align and decide to merge. then copy coverage information (additive) and edges
                                j = 0
                                while j < len(forward_path_2):
                                    k=j
                                    while k < len(forward_path_1):
                                        if (local_alignment(forward_path_2[j],forward_path_1[k],alignment_params[0], alignment_params[1],alignment_params[2],True) > score):
                                            #merge these and copy info
                                            G.node[forward_path_2[j]]['num'] += G.node[forward_path_1[k]]['num']
                                            #copy (incoming edges to forward_path_1[k]) to forward_path_2[j]
                                            for l in G.predecessors(forward_path_1[k]):
                                                G.add_edge(l,forward_path_2[j])
                                            #copy (outgoing edges from forward_path_1[k]) to forward_path_2[j]
                                            for l in G.neighbors(forward_path_1[k]):
                                                G.add_edge(forward_path_2[j],l)
                                            #delete forward_path_1[k]
                                            G.remove_node(forward_path_1[k])
                                            # Continue the outer scan after the matched index and leave the inner loop.
                                            j=k
                                            k = len(forward_path_1)
                                        else:
                                            k+=1
                                    j+=1
                            # One merge per call: report success to the caller.
                            return True
                    else:
                        #calculate distance to new nodes
                        distances[i] = distances[previous] + (float(len(i)) / G.node[i]['num'])
                        visited.append(i)
                if not have_merged:
                    if len(distances)>1:
                        # `previous` has been expanded; drop it from the frontier.
                        del distances[previous]
                        #find node with minimum distance
                        previous = min(distances, key=distances.get)
                    else:
                        # Nothing left to expand from this start node: end the while loop.
                        can_merge = False
                        have_merged=True
    return False
def tour_bus(G, score, alignment_params):
    """Look for one bubble in ``G`` and merge its two branches in place.

    Starting from each node of ``G`` in turn, nodes are expanded in order of
    increasing accumulated distance, where stepping onto a node ``n`` costs
    ``len(n) / G.node[n]['num']``.  When an expansion reaches a node that was
    already visited, two of its visited predecessors are traced back to the
    first node common to both traces, which yields the two branches of a
    candidate bubble.  The branch sequences (built by chaining ``overlap``
    over the node identifiers) are locally aligned; if the result exceeds
    ``score``, nodes of one branch are folded into matching nodes of the
    other: the ``'num'`` attributes are summed, incoming and outgoing edges
    are copied, and the absorbed node is removed.

    Args:
        G: directed graph providing ``nodes``, ``neighbors``,
            ``predecessors``, ``add_edge``, ``remove_node`` and the ``node``
            attribute mapping.  NOTE(review): looks like a networkx DiGraph
            whose node ids are sequence strings and whose ``'num'`` attribute
            is coverage -- confirm.
        score: a merge happens only when a local alignment result is strictly
            greater than this value.
        alignment_params: indexable; elements 0, 1 and 2 are forwarded as the
            third to fifth positional arguments of ``local_alignment``
            (presumably match, mismatch and gap scores -- confirm).

    Returns:
        True immediately after the first merge, False when no merge was made
        (including the "should never get here!" case).
    """
    # NOTE: this block uses the Python 2 print statement.
    for start_node in G.nodes():
        # Re-check membership on every iteration.
        if start_node in G.nodes():
            #bfs exploration of the graph. Visit nodes in increasing distance from the origin (the first node in the graph for now)
            # Need to make sure we cover all possible nodes.
            start = start_node
            previous = start
            # Accumulated distance of every discovered, not yet expanded node.
            distances = {}
            distances[start] = 0.0
            visited = [start]
            #have we found a bubble?
            have_merged = False
            while not have_merged:
                #unveil neighbors of previous (self-loops are ignored)
                new_nodes = [a for a in G.neighbors(previous) if a != previous]
                for i in new_nodes:
                    if i in visited:
                        #found a node that we've been to before!
                        # backtrack and find closest common ancestor. both predecessors of i are expected to be in the visited set.
                        to_traceback = [
                            a for a in G.predecessors(i) if a in visited
                        ]
                        # Fewer than two visited predecessors: not a bubble.
                        # This abandons the remaining neighbors of `previous` as well.
                        if len(to_traceback) < 2:
                            break
                        #traceback path 1. shouldn't encounter a node that has more than one predecessor in the visited set
                        traceback_path_1 = []
                        traceback_pred_1 = [to_traceback[0]]
                        while traceback_pred_1 != []:
                            traceback_path_1.append(traceback_pred_1[0])
                            # Only the first qualifying predecessor is followed on the next step.
                            traceback_pred_1 = [
                                a
                                for a in G.predecessors(traceback_pred_1[0])
                                if (a in visited) and (
                                    a not in traceback_path_1)
                            ]
                        #traceback path 2.
                        traceback_path_2 = []
                        traceback_pred_2 = [to_traceback[1]]
                        while traceback_pred_2 != []:
                            traceback_path_2.append(traceback_pred_2[0])
                            traceback_pred_2 = [
                                a
                                for a in G.predecessors(traceback_pred_2[0])
                                if (a in visited) and (
                                    a not in traceback_path_2)
                            ]
                        #find first element that occurs in both
                        # NOTE: raises IndexError if the two traces share no node.
                        overlap_first = [
                            j for j in traceback_path_1
                            if j in traceback_path_2
                        ][0]
                        #find first overlap between lists
                        overlap_pos1 = traceback_path_1.index(overlap_first)
                        overlap_pos2 = traceback_path_2.index(overlap_first)
                        #get rid of path past the overlap (the common node itself is dropped too)
                        traceback_path_1 = traceback_path_1[:overlap_pos1]
                        traceback_path_2 = traceback_path_2[:overlap_pos2]
                        # Reverse so the paths run from the ancestor side towards i.
                        forward_path_1 = traceback_path_1[::-1]
                        forward_path_2 = traceback_path_2[::-1]
                        #extract sequences from corresponding paths
                        sequence_1 = ''
                        for j in forward_path_1:
                            if sequence_1 != '':
                                sequence_1 = overlap(sequence_1, j)
                            else:
                                sequence_1 = j
                        sequence_2 = ''
                        for j in forward_path_2:
                            if sequence_2 != '':
                                sequence_2 = overlap(sequence_2, j)
                            else:
                                sequence_2 = j
                        #sequence to continue with reached end node first.
                        #that's the path that doesn't have 'previous' in it
                        if previous in forward_path_1:
                            seq_to_merge = 2
                        elif previous in forward_path_2:
                            seq_to_merge = 1
                        else:
                            print "should never get here!"
                            return False
                        #align the two sequences. use local alignment and a strict score
                        #could later convert this to a relative percentage match, something
                        # like the 80% they use in velvet.
                        merge = False
                        alignment_score = local_alignment(
                            sequence_1, sequence_2, alignment_params[0],
                            alignment_params[1], alignment_params[2], True)
                        if alignment_score > score:
                            merge = True
                        #Should the two sequences be merged?
                        if merge:
                            have_merged = True
                            #do merging!
                            if seq_to_merge == 1:
                                #path 1 is being represented in the merged graph. start at the beginning, look at the corresponding node in the other path
                                # align and decide to merge. then copy coverage information (additive) and edges
                                j = 0
                                while j < len(forward_path_1):
                                    k = j
                                    while k < len(forward_path_2):
                                        if (local_alignment(
                                                forward_path_1[j],
                                                forward_path_2[k],
                                                alignment_params[0],
                                                alignment_params[1],
                                                alignment_params[2],
                                                True) > score):
                                            #merge these and copy info
                                            G.node[forward_path_1[j]][
                                                'num'] += G.node[
                                                    forward_path_2[k]]['num']
                                            #copy (incoming edges to forward_path_2[k]) to forward_path_1[j]
                                            for l in G.predecessors(
                                                    forward_path_2[k]):
                                                G.add_edge(
                                                    l, forward_path_1[j])
                                            #copy (outgoing edges from forward_path_2[k]) to forward_path_1[j]
                                            for l in G.neighbors(
                                                    forward_path_2[k]):
                                                G.add_edge(
                                                    forward_path_1[j], l)
                                            #delete forward_path_2[k]
                                            G.remove_node(forward_path_2[k])
                                            # Continue the outer scan after the matched index and leave the inner loop.
                                            j = k
                                            k = len(forward_path_2)
                                        else:
                                            k += 1
                                    j += 1
                            if seq_to_merge == 2:
                                #path 2 is being represented in the merged graph. start at the beginning, look at the corresponding node in the other path
                                # align and decide to merge. then copy coverage information (additive) and edges
                                j = 0
                                while j < len(forward_path_2):
                                    k = j
                                    while k < len(forward_path_1):
                                        if (local_alignment(
                                                forward_path_2[j],
                                                forward_path_1[k],
                                                alignment_params[0],
                                                alignment_params[1],
                                                alignment_params[2],
                                                True) > score):
                                            #merge these and copy info
                                            G.node[forward_path_2[j]][
                                                'num'] += G.node[
                                                    forward_path_1[k]]['num']
                                            #copy (incoming edges to forward_path_1[k]) to forward_path_2[j]
                                            for l in G.predecessors(
                                                    forward_path_1[k]):
                                                G.add_edge(
                                                    l, forward_path_2[j])
                                            #copy (outgoing edges from forward_path_1[k]) to forward_path_2[j]
                                            for l in G.neighbors(
                                                    forward_path_1[k]):
                                                G.add_edge(
                                                    forward_path_2[j], l)
                                            #delete forward_path_1[k]
                                            G.remove_node(forward_path_1[k])
                                            # Continue the outer scan after the matched index and leave the inner loop.
                                            j = k
                                            k = len(forward_path_1)
                                        else:
                                            k += 1
                                    j += 1
                            # One merge per call: report success to the caller.
                            return True
                    else:
                        #calculate distance to new nodes
                        distances[i] = distances[previous] + (float(len(i)) /
                                                              G.node[i]['num'])
                        visited.append(i)
                if not have_merged:
                    if len(distances) > 1:
                        # `previous` has been expanded; drop it from the frontier.
                        del distances[previous]
                        #find node with minimum distance
                        previous = min(distances, key=distances.get)
                    else:
                        # Nothing left to expand from this start node: end the while loop.
                        can_merge = False
                        have_merged = True
    return False