Example #1
0
def align_mates2(contigs, mates):
    """Find, for every mate pair, the contigs hit by exactly one of its reads.

    Each entry of ``mates`` is read at index 0 (left read) and index 2 (right
    read). Both reads are aligned against every contig through
    ``local_alignment(contig, read, 1, -1, -99999999, True)``. The result of
    that call is used as ``result[0]`` (compared with the threshold) and
    ``result[1][0]`` (stored next to the contig index).

    A read counts as a hit when ``result[0]`` is at least
    ``len(left) - 2 * ceil(0.02 * len(left))``. The threshold is derived from
    the left read only and applied to both reads. When both reads hit the
    same contig, nothing is recorded for that contig.

    Returns a dict mapping every mate index to ``[left_hits, right_hits]``,
    each hit being ``[contig_index, result[1][0]]``. One line is printed per
    recorded hit.
    """
    # Every mate gets an entry up front, even when nothing matches it.
    hits_by_mate = {idx: [[], []] for idx in range(len(mates))}

    for mate_idx, pair in enumerate(mates):
        left_read = pair[0]
        right_read = pair[2]
        tolerance = int(math.ceil(len(left_read) * 0.02)) * 2
        min_score = len(left_read) - tolerance

        for contig_idx, contig in enumerate(contigs):
            left_aln = local_alignment(contig, left_read, 1, -1, -99999999,
                                       True)
            right_aln = local_alignment(contig, right_read, 1, -1, -99999999,
                                        True)

            left_hit = left_aln[0] >= min_score
            right_hit = right_aln[0] >= min_score

            # Only a one-sided hit is recorded; a double hit is ignored.
            if left_hit and not right_hit:
                hits_by_mate[mate_idx][0].append([contig_idx, left_aln[1][0]])
                message = ("mate " + str(mate_idx) +
                           " left matched in contig " + str(contig_idx))
                print(message)
            elif right_hit and not left_hit:
                hits_by_mate[mate_idx][1].append([contig_idx, right_aln[1][0]])
                message = ("mate " + str(mate_idx) +
                           " right matched in contig " + str(contig_idx))
                print(message)

    return hits_by_mate
Example #2
0
def align_mates2(contigs, mates):
	"""Collect the contigs that match exactly one read of each mate pair.

	``mates[n][0]`` and ``mates[n][2]`` are taken as the two reads of pair
	``n``. Each read is compared with every contig by calling
	``local_alignment(contig, read, 1, -1, -99999999, True)``; item ``[0]``
	of the result is tested against the cutoff and item ``[1][0]`` is stored
	with the contig index.

	The cutoff is ``len(first) - 2 * ceil(0.02 * len(first))`` and is based
	on the first read's length for both reads. A contig matched by both
	reads of a pair is not recorded for that pair.

	Returns ``{mate_index: [first_read_hits, second_read_hits]}`` where a hit
	is ``[contig_index, result[1][0]]``. Prints one line per recorded hit.
	"""
	# Pre-populate so that unmatched mates still appear in the result.
	matches = dict((n, [[], []]) for n in range(len(mates)))

	for mate_no, mate in enumerate(mates):
		first_read, second_read = mate[0], mate[2]
		allowed_loss = int(math.ceil(len(first_read) * 0.02)) * 2
		cutoff = len(first_read) - allowed_loss
		first_hits, second_hits = matches[mate_no]

		for contig_no, contig in enumerate(contigs):
			first_result = local_alignment(contig, first_read, 1, -1, -99999999, True)
			second_result = local_alignment(contig, second_read, 1, -1, -99999999, True)

			first_ok = first_result[0] >= cutoff
			second_ok = second_result[0] >= cutoff

			# Both reads landing on the same contig carries no linking information.
			if first_ok and second_ok:
				continue

			if first_ok:
				first_hits.append([contig_no, first_result[1][0]])
				print("mate " + str(mate_no) + " left matched in contig " + str(contig_no))
			elif second_ok:
				second_hits.append([contig_no, second_result[1][0]])
				print("mate " + str(mate_no) + " right matched in contig " + str(contig_no))

	return matches
Example #3
0
def align_mates(contigs, mates):
    """Record, per contig, the mate pairs that touch it with exactly one read.

    For every contig and every entry of ``mates`` the reads at index 0 and
    index 2 are aligned with
    ``local_alignment(contig, read, 1, -1, -99999999, True)``. A read matches
    when the alignment result is at least
    ``len(left) - ceil(0.02 * len(left))`` (about 2% errors allowed, measured
    on the left read for both reads). Two debug lines are printed for every
    contig/mate combination.

    NOTE(review): the alignment result is compared with the integer
    threshold directly, so this assumes ``local_alignment`` returns a bare
    score when its last argument is True -- confirm against its definition.

    Returns a dict ``{contig_index: [[mate_index, end], ...]}`` where ``end``
    is 1 for the left read and 2 for the right read. Contigs without any
    one-sided match have no key.
    """
    hits_by_contig = {}

    for contig_idx, contig in enumerate(contigs):
        for mate_idx, pair in enumerate(mates):
            left_read = pair[0]
            right_read = pair[2]
            left_score = local_alignment(contig, left_read, 1, -1, -99999999,
                                         True)
            right_score = local_alignment(contig, right_read, 1, -1,
                                          -99999999, True)

            # Allow for 2% errors.
            min_score = len(left_read) - int(math.ceil(len(left_read) * 0.02))

            first_line = ("contig: " + str(contig_idx) + " mate1: " +
                          str(mate_idx) + " al1: " + str(left_score) +
                          " thresh: " + str(min_score))
            print(first_line)
            second_line = ("al2: " + str(right_score) + " thresh: " +
                           str(min_score))
            print(second_line)

            left_hit = left_score >= min_score
            right_hit = right_score >= min_score

            # A mate whose two reads both match the contig is discarded.
            if left_hit and not right_hit:
                matched_end = 1
            elif right_hit and not left_hit:
                matched_end = 2
            else:
                continue

            hits_by_contig.setdefault(contig_idx, []).append(
                [mate_idx, matched_end])

    return hits_by_contig
Example #4
0
def main():
    """Interactively align two sequences from files and print the result.

    Console flow:
      1. Ask for a substitution-matrix file, build it with
         ``generate_sub_matrix`` and print its ``sub_scores``.
      2. For each of the two sequences, ask whether it is a nucleotide
         sequence and for its filename. Answers "Y"/"y" load the file with
         ``read_file_as_protein``; anything else uses ``read_file``.
      3. Ask for the alignment type and a gap score. "1" runs
         ``global_alignment`` with its fourth argument False, "2" runs it
         with True (the semiglobal option), and any other answer runs
         ``local_alignment``.
      4. Print ``alignment.out``, ``alignment.dirct`` and the two aligned
         arrays.

    Raises whatever ``int()`` raises if the gap score is not an integer.
    """

    def load_sequence(filename_prompt):
        # Same question order as the prompts: sequence kind first, then file.
        is_nucleotide = input("Is this a nucleotide sequence? Y/N: ")
        path = input(filename_prompt)
        if is_nucleotide in ("Y", "y"):
            return read_file_as_protein(path)
        return read_file(path)

    print("Hello!")
    print("First, let's pick a substitution matrix.")
    matrix_path = input("please enter a filename: ")
    smat = generate_sub_matrix(matrix_path)
    print(smat.sub_scores)

    cseq = load_sequence("Please enter filename of first sequence: ")
    rseq = load_sequence("Please enter filename of second sequence: ")

    for menu_line in ("If this is a global alignment, enter 1",
                      "If this is a semiglobal alignment, enter 2",
                      "If this is a local alignment, enter 3"):
        print(menu_line)
    choice = input("Please enter 1, 2, or 3: ")
    gap_score = int(input("Please enter a gap score: "))

    if choice in ("1", "2"):
        # "2" switches global_alignment into its semiglobal mode.
        alignment = global_alignment(smat, cseq, rseq, choice == "2",
                                     gap_score)
    else:
        alignment = local_alignment(smat, cseq, rseq, gap_score)

    print_formatted(alignment.out, 4)
    print_formatted(alignment.dirct, 3)
    print_seq(alignment.arr1, alignment.arr2)
Example #5
0
def contig_dict(reads, contigs, error_rate):
	"""Map every read to the contigs it aligns to and the position of the hit.

	Each read is scored against each contig with
	``local_alignment(read, contig, 1, -1, -99, True)``. The original comment
	describes this as a gapless local alignment; the -99 argument is
	presumably the gap score -- confirm against ``local_alignment``.
	A pair is kept when the score is at least ``len(read) - 5``. For kept
	pairs the alignment itself is fetched (same call with ``False``) and
	item ``[1]`` of it is searched for inside the contig.

	Args:
		reads: iterable of reads; each read is used as a dict key and must
			support ``len()``.
		contigs: iterable of contigs; each must provide ``find()``
			(presumably strings -- confirm).
		error_rate: currently unused. The mismatch allowance is fixed at 5;
			an error-rate based threshold exists only as commented-out code
			in the original.

	Returns:
		dict mapping a read to a list of ``[contig_index, position]`` pairs,
		where ``position`` is the result of ``contig.find(...)`` (-1 when
		the aligned segment is not found verbatim). Reads without any hit
		have no key.
	"""
	read_dict = {}
	for read in reads:
		threshold_score = len(read) - 5
		# enumerate() gives the true position of the contig. The previous
		# contigs.index(contig) returned the first *equal* contig, which is
		# wrong when the same sequence occurs more than once.
		for contig_index, contig in enumerate(contigs):
			score = local_alignment(read, contig, 1, -1, -99, True)
			if score >= threshold_score:
				# Only pay for the full alignment once the score qualifies.
				alignment = local_alignment(read, contig, 1, -1, -99, False)
				# str.find replaces the deprecated string.find(s, sub) call.
				alignment_pos = contig.find(alignment[1])
				read_dict.setdefault(read, []).append([contig_index, alignment_pos])
	return read_dict
Example #6
0
def contig_dict(reads, contigs, error_rate):
    """Map every read to the contigs it aligns to and the position of the hit.

    Each read is scored against each contig with
    ``local_alignment(read, contig, 1, -1, -99, True)``. The original comment
    describes this as a gapless local alignment; the -99 argument is
    presumably the gap score -- confirm against ``local_alignment``.
    A pair is kept when the score is at least ``len(read) - 5``. For kept
    pairs the alignment itself is fetched (same call with ``False``) and
    item ``[1]`` of it is searched for inside the contig.

    Args:
        reads: iterable of reads; each read is used as a dict key and must
            support ``len()``.
        contigs: iterable of contigs; each must provide ``find()``
            (presumably strings -- confirm).
        error_rate: currently unused. The mismatch allowance is fixed at 5;
            an error-rate based threshold exists only as commented-out code
            in the original.

    Returns:
        dict mapping a read to a list of ``[contig_index, position]`` pairs,
        where ``position`` is the result of ``contig.find(...)`` (-1 when
        the aligned segment is not found verbatim). Reads without any hit
        have no key.
    """
    read_dict = {}
    for read in reads:
        threshold_score = len(read) - 5
        # enumerate() gives the true position of the contig. The previous
        # contigs.index(contig) returned the first *equal* contig, which is
        # wrong when the same sequence occurs more than once.
        for contig_index, contig in enumerate(contigs):
            score = local_alignment(read, contig, 1, -1, -99, True)
            if score >= threshold_score:
                # Only pay for the full alignment once the score qualifies.
                alignment = local_alignment(read, contig, 1, -1, -99, False)
                # str.find replaces the deprecated string.find(s, sub) call.
                alignment_pos = contig.find(alignment[1])
                read_dict.setdefault(read, []).append(
                    [contig_index, alignment_pos])
    return read_dict
Example #7
0
def tour_bus(G, score, alignment_params):
	"""Search G for a bubble (two paths that rejoin) and merge the first one found.

	Each node of G is tried in turn as the start of a search that expands
	nodes in order of increasing accumulated distance; reaching node n adds
	len(n) / G.node[n]['num'] to the distance of the node it was reached
	from. When a neighbour of the node being expanded has already been
	visited, the first two visited predecessors of that neighbour are traced
	back to their first shared ancestor. The nodes on each branch are
	stitched into one sequence with overlap(), and the two sequences are
	compared with local_alignment(). If the result is greater than `score`,
	nodes of one branch are folded into matching nodes of the other: 'num'
	is added, incoming and outgoing edges are copied, and the node is
	removed.

	Args:
		G: graph object. The code uses nodes(), neighbors(), predecessors(),
			node[...]['num'], add_edge() and remove_node(). Presumably a
			networkx 1.x DiGraph whose nodes are sequence strings -- TODO
			confirm.
		score: value a local_alignment() result must exceed, both for the
			two branch sequences and for individual node pairs.
		alignment_params: indexable; items 0, 1 and 2 are forwarded to
			local_alignment() as its third, fourth and fifth arguments.

	Returns:
		True right after one bubble has been merged (G is modified in
		place). False when no start node leads to a merge, or when the node
		being expanded lies on neither traced branch.
	"""
	for start_node in G.nodes():
		# NOTE(review): nodes are only removed in the merge section, which
		# returns immediately afterwards, so this membership re-check looks
		# redundant -- confirm before removing.
		if start_node in G.nodes():
			# Distance-ordered exploration from start_node: always expand the
			# not-yet-expanded node with the smallest accumulated distance.
			start = start_node
			previous = start
			distances = {}
			distances[start] = 0.0
			visited = [start]
			# Loop flag. Also set to True (without merging) when the search
			# from this start node runs out of nodes to expand.
			have_merged=False
			while not have_merged:
				# Neighbours of the node being expanded, ignoring self-loops.
				new_nodes = [a for a in G.neighbors(previous) if a != previous]
				for i in new_nodes:
					if i in visited:
						# i was reached before: candidate end of a bubble.
						# Collect the predecessors of i that have been visited;
						# the first two become the ends of the two branches.
						to_traceback = [a for a in G.predecessors(i) if a in visited]
						
						# Fewer than two visited predecessors means there are no two
						# branches to compare. NOTE: break leaves the loop over
						# new_nodes, so remaining neighbours are skipped this pass.
						if len(to_traceback) <2:
							break

						# Branch 1: walk backwards, always following the first
						# visited predecessor that is not already on the path,
						# until no such predecessor remains.
						traceback_path_1 = []
						traceback_pred_1 = [to_traceback[0]]
						while traceback_pred_1 != []:
							traceback_path_1.append(traceback_pred_1[0])
							traceback_pred_1 = [a for a in G.predecessors(traceback_pred_1[0]) if (a in visited) and (a not in traceback_path_1)]

						# Branch 2: same walk, starting from the second predecessor.
						traceback_path_2 = []
						traceback_pred_2 = [to_traceback[1]]
						while traceback_pred_2 != []:
							traceback_path_2.append(traceback_pred_2[0])
							traceback_pred_2 = [a for a in G.predecessors(traceback_pred_2[0]) if (a in visited) and (a not in traceback_path_2)]
						
						# First node of branch 1 that also lies on branch 2: the
						# shared ancestor. Raises IndexError if the two walks
						# share no node.
						overlap_first = [j for j in traceback_path_1 if j in traceback_path_2][0]

						# Position of the shared ancestor on each branch.
						overlap_pos1 = traceback_path_1.index(overlap_first)
						overlap_pos2 = traceback_path_2.index(overlap_first)

						# Keep only the nodes between the ancestor and i (both
						# excluded), then reverse so the paths run forwards.
						traceback_path_1 = traceback_path_1[:overlap_pos1]
						traceback_path_2 = traceback_path_2[:overlap_pos2]
						forward_path_1 = traceback_path_1[::-1]
						forward_path_2 = traceback_path_2[::-1]

						# Stitch the nodes of each branch into one sequence with
						# overlap(); an empty branch leaves the sequence as ''.
						sequence_1 = '' 
						for j in forward_path_1:
							if sequence_1 != '':
								sequence_1 = overlap(sequence_1,j)
							else: 
								sequence_1 = j
						sequence_2 = '' 
						for j in forward_path_2:
							if sequence_2 != '':
								sequence_2 = overlap(sequence_2, j)
							else: 
								sequence_2 = j

						# The branch to keep is the one that reached i first, i.e.
						# the one that does NOT contain the node being expanded.
						if previous in forward_path_1:
							seq_to_merge = 2
						elif previous in forward_path_2:
							seq_to_merge =1 
						else: 
							# `previous` is on neither branch: give up on the whole
							# search, not just on this start node.
							print "should never get here!"
							return False

						# Compare the two branch sequences with a strict absolute
						# threshold. Could later become a relative percentage
						# match, something like the 80% used in velvet.
						merge = False
						alignment_score = local_alignment(sequence_1,sequence_2, alignment_params[0], alignment_params[1],alignment_params[2],True)
						if alignment_score >  score:
							merge=True

						# Should the two sequences be merged?
						if merge:
							have_merged=True
							if seq_to_merge == 1:
								# Branch 1 survives. For each kept node j, scan branch 2
								# from index j for the first node that aligns above
								# `score`; fold it in (additive 'num', copied edges) and
								# remove it. After a match at k the scan of branch 1
								# resumes at k + 1.
								j = 0 
								while j < len(forward_path_1):
									k=j
									while k < len(forward_path_2):
										if (local_alignment(forward_path_1[j],forward_path_2[k],alignment_params[0], alignment_params[1],alignment_params[2],True) > score):
											# accumulate coverage on the surviving node
											G.node[forward_path_1[j]]['num'] += G.node[forward_path_2[k]]['num']
											# copy (incoming edges to forward_path_2[k]) to forward_path_1[j]
											for l in G.predecessors(forward_path_2[k]):
												G.add_edge(l,forward_path_1[j])
											# copy (outgoing edges from forward_path_2[k]) to forward_path_1[j]
											for l in G.neighbors(forward_path_2[k]):
												G.add_edge(forward_path_1[j],l)
											# delete forward_path_2[k]
											G.remove_node(forward_path_2[k])
											# jump j to the matched index and end the inner scan
											j=k
											k = len(forward_path_2)
										else: k+=1
									j+=1
							if seq_to_merge == 2:
								# Branch 2 survives; mirror image of the case above with
								# the roles of the two branches swapped.
								j = 0 
								while j < len(forward_path_2):
									k=j
									while k < len(forward_path_1):
										if (local_alignment(forward_path_2[j],forward_path_1[k],alignment_params[0], alignment_params[1],alignment_params[2],True) > score):
											# accumulate coverage on the surviving node
											G.node[forward_path_2[j]]['num'] += G.node[forward_path_1[k]]['num']
											# copy (incoming edges to forward_path_1[k]) to forward_path_2[j]
											for l in G.predecessors(forward_path_1[k]):
												G.add_edge(l,forward_path_2[j])
											# copy (outgoing edges from forward_path_1[k]) to forward_path_2[j]
											for l in G.neighbors(forward_path_1[k]):
												G.add_edge(forward_path_2[j],l)
											# delete forward_path_1[k]
											G.remove_node(forward_path_1[k])
											# jump j to the matched index and end the inner scan
											j=k
											k = len(forward_path_1)
										else: k+=1
									j+=1
							# One merge per call: report success to the caller.
							return True
					else: 
						# First time i is seen: record its accumulated distance
						# (length of the node divided by its 'num' attribute).
						distances[i] = distances[previous] + (float(len(i)) / G.node[i]['num'])
						visited.append(i)
				
				if not have_merged:
					if len(distances)>1:
						# Retire the expanded node and continue from the closest
						# remaining one.
						del distances[previous]
						previous = min(distances, key=distances.get)
					else: 
						# Nothing left to expand from this start node; setting the
						# flag only ends the while loop. NOTE: can_merge is never
						# read in this function.
						can_merge = False
						have_merged=True
	return False
Example #8
0
def tour_bus(G, score, alignment_params):
    """Search G for a bubble (two paths that rejoin) and merge the first one found.

    Each node of G is tried in turn as the start of a search that expands
    nodes in order of increasing accumulated distance; reaching node n adds
    len(n) / G.node[n]['num'] to the distance of the node it was reached
    from. When a neighbour of the node being expanded has already been
    visited, the first two visited predecessors of that neighbour are traced
    back to their first shared ancestor. The nodes on each branch are
    stitched into one sequence with overlap(), and the two sequences are
    compared with local_alignment(). If the result is greater than `score`,
    nodes of one branch are folded into matching nodes of the other: 'num'
    is added, incoming and outgoing edges are copied, and the node is
    removed.

    Args:
        G: graph object. The code uses nodes(), neighbors(), predecessors(),
            node[...]['num'], add_edge() and remove_node(). Presumably a
            networkx 1.x DiGraph whose nodes are sequence strings -- TODO
            confirm.
        score: value a local_alignment() result must exceed, both for the
            two branch sequences and for individual node pairs.
        alignment_params: indexable; items 0, 1 and 2 are forwarded to
            local_alignment() as its third, fourth and fifth arguments.

    Returns:
        True right after one bubble has been merged (G is modified in
        place). False when no start node leads to a merge, or when the node
        being expanded lies on neither traced branch.
    """
    for start_node in G.nodes():
        # NOTE(review): nodes are only removed in the merge section, which
        # returns immediately afterwards, so this membership re-check looks
        # redundant -- confirm before removing.
        if start_node in G.nodes():
            # Distance-ordered exploration from start_node: always expand the
            # not-yet-expanded node with the smallest accumulated distance.
            start = start_node
            previous = start
            distances = {}
            distances[start] = 0.0
            visited = [start]
            # Loop flag. Also set to True (without merging) when the search
            # from this start node runs out of nodes to expand.
            have_merged = False
            while not have_merged:
                # Neighbours of the node being expanded, ignoring self-loops.
                new_nodes = [a for a in G.neighbors(previous) if a != previous]
                for i in new_nodes:
                    if i in visited:
                        # i was reached before: candidate end of a bubble.
                        # Collect the predecessors of i that have been visited;
                        # the first two become the ends of the two branches.
                        to_traceback = [
                            a for a in G.predecessors(i) if a in visited
                        ]

                        # Fewer than two visited predecessors means there are
                        # no two branches to compare. NOTE: break leaves the
                        # loop over new_nodes, so remaining neighbours are
                        # skipped this pass.
                        if len(to_traceback) < 2:
                            break

                        # Branch 1: walk backwards, always following the first
                        # visited predecessor that is not already on the path,
                        # until no such predecessor remains.
                        traceback_path_1 = []
                        traceback_pred_1 = [to_traceback[0]]
                        while traceback_pred_1 != []:
                            traceback_path_1.append(traceback_pred_1[0])
                            traceback_pred_1 = [
                                a for a in G.predecessors(traceback_pred_1[0])
                                if (a in visited) and (
                                    a not in traceback_path_1)
                            ]

                        # Branch 2: same walk, starting from the second
                        # predecessor.
                        traceback_path_2 = []
                        traceback_pred_2 = [to_traceback[1]]
                        while traceback_pred_2 != []:
                            traceback_path_2.append(traceback_pred_2[0])
                            traceback_pred_2 = [
                                a for a in G.predecessors(traceback_pred_2[0])
                                if (a in visited) and (
                                    a not in traceback_path_2)
                            ]

                        # First node of branch 1 that also lies on branch 2:
                        # the shared ancestor. Raises IndexError if the two
                        # walks share no node.
                        overlap_first = [
                            j for j in traceback_path_1
                            if j in traceback_path_2
                        ][0]

                        # Position of the shared ancestor on each branch.
                        overlap_pos1 = traceback_path_1.index(overlap_first)
                        overlap_pos2 = traceback_path_2.index(overlap_first)

                        # Keep only the nodes between the ancestor and i (both
                        # excluded), then reverse so the paths run forwards.
                        traceback_path_1 = traceback_path_1[:overlap_pos1]
                        traceback_path_2 = traceback_path_2[:overlap_pos2]
                        forward_path_1 = traceback_path_1[::-1]
                        forward_path_2 = traceback_path_2[::-1]

                        # Stitch the nodes of each branch into one sequence
                        # with overlap(); an empty branch leaves the sequence
                        # as ''.
                        sequence_1 = ''
                        for j in forward_path_1:
                            if sequence_1 != '':
                                sequence_1 = overlap(sequence_1, j)
                            else:
                                sequence_1 = j
                        sequence_2 = ''
                        for j in forward_path_2:
                            if sequence_2 != '':
                                sequence_2 = overlap(sequence_2, j)
                            else:
                                sequence_2 = j

                        # The branch to keep is the one that reached i first,
                        # i.e. the one that does NOT contain the node being
                        # expanded.
                        if previous in forward_path_1:
                            seq_to_merge = 2
                        elif previous in forward_path_2:
                            seq_to_merge = 1
                        else:
                            # `previous` is on neither branch: give up on the
                            # whole search, not just on this start node.
                            print "should never get here!"
                            return False

                        # Compare the two branch sequences with a strict
                        # absolute threshold. Could later become a relative
                        # percentage match, something like the 80% used in
                        # velvet.
                        merge = False
                        alignment_score = local_alignment(
                            sequence_1, sequence_2, alignment_params[0],
                            alignment_params[1], alignment_params[2], True)
                        if alignment_score > score:
                            merge = True

                        # Should the two sequences be merged?
                        if merge:
                            have_merged = True
                            if seq_to_merge == 1:
                                # Branch 1 survives. For each kept node j, scan
                                # branch 2 from index j for the first node that
                                # aligns above `score`; fold it in (additive
                                # 'num', copied edges) and remove it. After a
                                # match at k the scan of branch 1 resumes at
                                # k + 1.
                                j = 0
                                while j < len(forward_path_1):
                                    k = j
                                    while k < len(forward_path_2):
                                        if (local_alignment(
                                                forward_path_1[j],
                                                forward_path_2[k],
                                                alignment_params[0],
                                                alignment_params[1],
                                                alignment_params[2], True) >
                                                score):
                                            # accumulate coverage on the surviving node
                                            G.node[forward_path_1[j]][
                                                'num'] += G.node[
                                                    forward_path_2[k]]['num']
                                            # copy (incoming edges to forward_path_2[k]) to forward_path_1[j]
                                            for l in G.predecessors(
                                                    forward_path_2[k]):
                                                G.add_edge(
                                                    l, forward_path_1[j])
                                            # copy (outgoing edges from forward_path_2[k]) to forward_path_1[j]
                                            for l in G.neighbors(
                                                    forward_path_2[k]):
                                                G.add_edge(
                                                    forward_path_1[j], l)
                                            # delete forward_path_2[k]
                                            G.remove_node(forward_path_2[k])
                                            # jump j to the matched index and end the inner scan
                                            j = k
                                            k = len(forward_path_2)
                                        else:
                                            k += 1
                                    j += 1
                            if seq_to_merge == 2:
                                # Branch 2 survives; mirror image of the case
                                # above with the roles of the two branches
                                # swapped.
                                j = 0
                                while j < len(forward_path_2):
                                    k = j
                                    while k < len(forward_path_1):
                                        if (local_alignment(
                                                forward_path_2[j],
                                                forward_path_1[k],
                                                alignment_params[0],
                                                alignment_params[1],
                                                alignment_params[2], True) >
                                                score):
                                            # accumulate coverage on the surviving node
                                            G.node[forward_path_2[j]][
                                                'num'] += G.node[
                                                    forward_path_1[k]]['num']
                                            # copy (incoming edges to forward_path_1[k]) to forward_path_2[j]
                                            for l in G.predecessors(
                                                    forward_path_1[k]):
                                                G.add_edge(
                                                    l, forward_path_2[j])
                                            # copy (outgoing edges from forward_path_1[k]) to forward_path_2[j]
                                            for l in G.neighbors(
                                                    forward_path_1[k]):
                                                G.add_edge(
                                                    forward_path_2[j], l)
                                            # delete forward_path_1[k]
                                            G.remove_node(forward_path_1[k])
                                            # jump j to the matched index and end the inner scan
                                            j = k
                                            k = len(forward_path_1)
                                        else:
                                            k += 1
                                    j += 1
                            # One merge per call: report success to the caller.
                            return True
                    else:
                        # First time i is seen: record its accumulated distance
                        # (length of the node divided by its 'num' attribute).
                        distances[i] = distances[previous] + (float(len(i)) /
                                                              G.node[i]['num'])
                        visited.append(i)

                if not have_merged:
                    if len(distances) > 1:
                        # Retire the expanded node and continue from the
                        # closest remaining one.
                        del distances[previous]
                        previous = min(distances, key=distances.get)
                    else:
                        # Nothing left to expand from this start node; setting
                        # the flag only ends the while loop. NOTE: can_merge is
                        # never read in this function.
                        can_merge = False
                        have_merged = True
    return False