def check(self):
    """Assert that the graph's vertices and edges are internally consistent.

    Four invariants are verified, each in its own full pass so that a
    violation of an earlier invariant is always reported before a violation
    of a later one:

    1. every vertex in ``self.vs`` has a truthy ``conj``;
    2. every edge in ``self.es`` has a truthy ``conj``;
    3. for every edge, ``len(edge.seq) - edge.len`` equals ``self.K``;
    4. every edge sequence equals ``utils.rc`` of its conjugate's sequence.

    Raises:
        AssertionError: on the first violated invariant.
    """
    edge_map = self.es

    for vertex in self.vs.itervalues():
        assert vertex.conj, "Some vertex have no conjugate"

    for edge in edge_map.itervalues():
        assert edge.conj, "Some edge have no conjugate"

    for edge in edge_map.itervalues():
        overlap = len(edge.seq) - edge.len
        assert self.K == overlap, "Inconsistent K"

    for edge in edge_map.itervalues():
        mirrored = utils.rc(edge.conj.seq)
        assert edge.seq == mirrored, (edge.seq, mirrored)
def __get_kmers_pos(self, genome, k):
    """Index every k-mer of ``genome``, and its reverse complement, by position.

    For the k-mer starting at 0-based index ``i`` the forward entry records
    ``i + 1`` and the entry for ``utils.rc(kmer)`` records
    ``-(len(genome) - i - k + 1)``.

    Both positions are always appended.  Previously the first occurrence of
    a k-mer that equals its own reverse complement had its forward position
    overwritten by the reverse one, while later occurrences kept both.

    Args:
        genome: sequence to index (sliced with ``genome[i:i + k]``).
        k: k-mer length.

    Returns:
        dict mapping k-mer -> list of signed positions, in scan order.
    """
    kmers = dict()
    genome_len = len(genome)
    for i in range(genome_len - k + 1):
        kmer = genome[i:i + k]
        kmers.setdefault(kmer, []).append(i + 1)
        kmers.setdefault(utils.rc(kmer), []).append(-(genome_len - i - k + 1))
    return kmers
def __get_kmers_pos(self, genome, k):
    """Map each k-mer of ``genome`` and its reverse complement to positions.

    Scanning left to right, the k-mer at 0-based offset ``start`` contributes
    ``start + 1`` to its own list and ``-(len(genome) - start - k + 1)`` to
    the list of ``utils.rc(kmer)``.

    Every occurrence contributes both values.  The earlier implementation
    assigned fresh lists on first sight, so a k-mer identical to its reverse
    complement lost the forward position of its first occurrence.

    Args:
        genome: sequence to index.
        k: k-mer length.

    Returns:
        dict from k-mer to a list of signed positions, in scan order.
    """
    kmers = dict()
    total = len(genome)
    for start in range(total - k + 1):
        kmer = genome[start:start + k]
        hits = (
            (kmer, start + 1),
            (utils.rc(kmer), -(total - start - k + 1)),
        )
        for key, position in hits:
            if key not in kmers:
                kmers[key] = []
            kmers[key].append(position)
    return kmers
def extract_path(G, path, type="str"):
    """Spell the sequence of an oriented node path through ``G``.

    Each entry of ``path`` is a string made of an integer node id followed
    by an orientation character: ``'+'`` takes ``G.node[id]['seq']`` as is,
    ``'-'`` takes ``utils.rc`` of it.

    The concatenated sequence is now returned; previously it was built and
    then discarded, so the function always returned ``None``.

    Args:
        G: graph exposing ``G.node[id]['seq']``.
        path: iterable of ``"<id><+|->"`` strings.
        type: not used by this function; kept for interface compatibility.

    Returns:
        The concatenated sequence as a ``str`` (empty for an empty path).

    Raises:
        AssertionError: if an entry's orientation is neither '+' nor '-'.
        ValueError: if the id part of an entry is not an integer.
    """
    logging.debug("Extracting path of length: %d" % len(path))
    pieces = []
    for n in path:
        nid, o = int(n[:-1]), n[-1:]
        assert (o == '+' or o == '-')
        node_seq = G.node[nid]['seq']
        pieces.append(node_seq if o == "+" else utils.rc(node_seq))
    # Join once instead of growing a string with += on every node.
    return "".join(pieces)
def comp(G):
    """Reverse-complement the sequences and offsets of ``G`` and return ``G``.

    Every node's ``'seq'`` is replaced by ``utils.rc`` of it.  For each
    sample in ``G.graph['paths']`` the genome length is taken as the largest
    ``offset + len(seq)`` over the nodes carrying that sample (0 if none),
    and each such node's offset is then rewritten to
    ``length - (offset + len(seq))``.
    """
    node_attrs = G.node
    for key in node_attrs:
        node_attrs[key]['seq'] = utils.rc(node_attrs[key]['seq'])

    # Relabel the offsets: determine the length of all genomes in the
    # graph first, then turn every position p into l - p.
    genome2length = dict()
    for sample in G.graph['paths']:
        ends = [data['offsets'][sample] + len(data['seq'])
                for _, data in G.nodes(data=True)
                if sample in data['offsets']]
        genome2length[sample] = max([0] + ends)

    for sample in G.graph['paths']:
        total = genome2length[sample]
        for node, data in G.nodes(data=True):
            if sample not in data['offsets']:
                continue
            offsets = G.node[node]['offsets']
            offsets[sample] = total - (offsets[sample] + len(data['seq']))

    # NOTE(review): the value returned by reverse() is discarded, so this
    # only flips edge directions if reverse(copy=False) mutates G in place
    # in the networkx version in use - confirm.
    G.reverse(copy=False)
    return G
def comp(G):
    """Turn ``G`` into its reverse complement in place and hand it back.

    Steps, in order:

    * each node's ``'seq'`` becomes ``utils.rc(seq)``;
    * per sample listed in ``G.graph['paths']``, the genome length is the
      maximum of ``offset + len(seq)`` over nodes that have an offset for
      that sample (starting from 0);
    * each of those offsets becomes ``length - (offset + len(seq))``;
    * ``G.reverse(copy=False)`` is called and ``G`` is returned.
    """
    for n in G.node:
        attrs = G.node[n]
        attrs['seq'] = utils.rc(attrs['seq'])

    samples = G.graph['paths']

    # Pass 1: length of every genome in the graph.
    genome2length = {}
    for sample in samples:
        longest = 0
        for _, attrs in G.nodes(data=True):
            if sample in attrs['offsets']:
                end = attrs['offsets'][sample] + len(attrs['seq'])
                if end > longest:
                    longest = end
        genome2length[sample] = longest

    # Pass 2: relabel each offset as (genome length - end position).
    for sample in samples:
        for n, attrs in G.nodes(data=True):
            if sample in attrs['offsets']:
                end = G.node[n]['offsets'][sample] + len(attrs['seq'])
                G.node[n]['offsets'][sample] = genome2length[sample] - end

    # NOTE(review): reverse()'s return value is ignored here; whether the
    # edges of G itself get flipped depends on reverse(copy=False) working
    # in place - confirm against the networkx version in use.
    G.reverse(copy=False)
    return G
def use_scaffold_paired_info(self, L, additional_prd):
    """Bridge pairs of long edges that the scaffold paired info links.

    For every ordered pair ``(e1, e2)`` of edges in ``self.es`` whose
    ``length()`` exceeds ``L``, the last rectangle of ``e1`` and the first
    rectangle of ``e2`` are inspected.  If ``additional_prd`` has an entry
    for ``(last.e2.eid, first.e1.eid)``, the facing vertices are dead ends,
    and ``D - last.e2.len`` lies strictly between 0 and 100, two new edges
    of length ``K + 3`` are added to ``self.graph`` (each registered with
    the other's id), spelling ``<last K bases> + "NNN" + <first K bases>``
    and its ``utils.rc``.

    Fixes over the previous version: ``diag.rectange`` (a typo that raised
    ``AttributeError`` while building the second path) is now
    ``diag.rectangle``, and the debug output uses single-string ``print(...)``
    calls, which behave the same under Python 2 and 3.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source - in particular that the dead-end test is a sibling of (not nested
    in) the ``is_connected`` test; confirm against the original file.

    Args:
        L: length threshold; only edges with ``edge.length() > L`` are used.
        additional_prd: mapping keyed by ``(eid, eid)``; the first item of
            each value is unpacked as ``(D, weight, delta)``.

    Returns:
        Set of ``(e1.eid, e2.eid)`` pairs for which connecting edges were
        added.
    """
    used_paires = set()
    connect_edges = set()
    count_correct_scaffolds = 0
    count_incorrect_scaffolds = 0
    long_edges = set(edge for edge in self.es.values() if edge.length() > L)

    for e1 in long_edges:
        for e2 in long_edges:
            first_rectangle = e1.diagonals[-1].rectangle
            second_rectangle = e2.diagonals[0].rectangle
            e11 = first_rectangle.e1
            e12 = first_rectangle.e2
            e21 = second_rectangle.e1
            e22 = second_rectangle.e2

            pair = (e12.eid, e21.eid)
            if pair not in additional_prd:
                continue
            # Only the first record is consulted; weight/delta are unused.
            (D, _weight, _delta) = additional_prd[pair][0]

            if not self.graph.is_connected(e12.v2, e21, 10):
                count_correct_scaffolds += 1
            # Both facing ends must be dead ends for a bridge to be added.
            if len(e12.v2.out) != 0 or len(e21.v1.inn) != 0:
                continue
            used_paires.add(pair)
            count_incorrect_scaffolds += 1

            gap = D - e12.len
            if not 0 < gap < 100:
                continue

            print("SHOULD CONNECT %s %s %s \n%s \n%s" % (
                (e1.eid, e2.eid), pair, gap, e12.seq[-55:], e21.seq[:55]))
            connect_edges.add((e1.eid, e2.eid))

            K = self.graph.K
            max_eid = self.graph.max_eid
            self.graph.add_edge(max_eid, e12.v2.vid, e21.v1.vid,
                                K + 3, max_eid + 1)
            self.graph.add_edge(max_eid + 1, e21.conj.v2.vid, e12.conj.v1.vid,
                                K + 3, max_eid)
            seq = e12.seq[-K:] + "NNN" + e21.seq[:K]
            self.graph.add_seq(max_eid, seq)
            self.graph.add_seq(max_eid + 1, utils.rc(seq))
            # Sanity check: the same bridge read off the conjugate
            # rectangles must be the reverse complement of ``seq``.
            seq2 = (second_rectangle.conj.e2.seq[-K:] + "NNN"
                    + first_rectangle.conj.e1.seq[:K])
            assert seq2 == utils.rc(seq), "\n" + seq2 + "\n" + utils.rc(seq)

            # Path along e1's second-coordinate edges (starting once e11 is
            # seen), ending in the new bridge edge.
            path_1 = []
            path_2 = []
            used = set()
            begin_path = False
            start_offset = 0
            for diag in e1.diagonals:
                if e11 == diag.rectangle.e2:
                    begin_path = True
                if begin_path and diag.rectangle.e2 not in used:
                    path_1.append(diag.rectangle.e2)
                    used.add(diag.rectangle.e2)
            path_1.append(self.graph.es[max_eid])
            if e1.diagonals[-1].rectangle.e2.len <= e1.diagonals[-1].offsetc:
                path_1 = path_1[1:]
                start_offset = 0
            else:
                start_offset = e1.diagonals[-1].offsetc

            # Path starting at the bridge edge and following e2's
            # first-coordinate edges until e22 is reached.
            path_2.append(self.graph.es[max_eid])
            path_2.append(e2.diagonals[0].rectangle.e1)
            used = set()
            for diag in e2.diagonals:
                if e22 == diag.rectangle.e1:
                    break
                if diag.rectangle.e1 not in used:
                    path_2.append(diag.rectangle.e1)
                    used.add(diag.rectangle.e1)
            print("path1 %s path2 %s" % ([e.eid for e in path_1],
                                         [e.eid for e in path_2]))
            # start_offset is kept for the currently disabled call:
            # self.add_rectangles_by_path(path_1, path_2, start_offset)

    self.test_utils.logger.info(
        "count_correct_scaffolds " + str(count_correct_scaffolds) + " "
        + str(count_incorrect_scaffolds) + " " + str(len(used_paires)) + "\n")
    return connect_edges
def use_scaffold_paired_info(self, L, additional_prd):
    """Add bridging edges between long edges linked by scaffold paired info.

    All edges of ``self.es`` with ``length() > L`` are paired with each
    other.  For a pair ``(e1, e2)``, let ``e12`` be the second edge of
    ``e1``'s last rectangle and ``e21`` the first edge of ``e2``'s first
    rectangle.  When ``(e12.eid, e21.eid)`` is a key of ``additional_prd``,
    ``e12.v2`` has no outgoing and ``e21.v1`` no incoming edges, and
    ``D - e12.len`` is strictly between 0 and 100, two edges of length
    ``K + 3`` are added to ``self.graph`` carrying
    ``e12.seq[-K:] + "NNN" + e21.seq[:K]`` and its ``utils.rc``.

    A leftover expression statement (the argument list of a removed debug
    print) that built a tuple and threw it away has been dropped.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source; confirm against the original file.  ``path_1``, ``path_2`` and
    ``start_offset`` are computed but not consumed by the visible code.

    Args:
        L: length threshold for an edge to take part.
        additional_prd: mapping keyed by ``(eid, eid)``; the first item of
            each value is unpacked as ``(D, weight, delta)``.

    Returns:
        Set of ``(e1.eid, e2.eid)`` pairs that were connected.
    """
    long_edges = set()
    used_paires = set()
    connect_edges = set()
    count_correct_scaffolds = 0
    count_incorrect_scaffolds = 0
    for edge in self.es.values():
        if edge.length() > L:
            long_edges.add(edge)

    for e1 in long_edges:
        for e2 in long_edges:
            first_rectangle = e1.diagonals[-1].rectangle
            second_rectangle = e2.diagonals[0].rectangle
            e11 = first_rectangle.e1
            e12 = first_rectangle.e2
            e21 = second_rectangle.e1
            e22 = second_rectangle.e2
            if (e12.eid, e21.eid) in additional_prd:
                # Only the first record is read; weight and delta are unused.
                (D, _weight, _delta) = additional_prd[(e12.eid, e21.eid)][0]
                if not self.graph.is_connected(e12.v2, e21, 10):
                    count_correct_scaffolds += 1
                # Skip unless both facing ends are dead ends.
                if len(e12.v2.out) != 0 or len(e21.v1.inn) != 0:
                    continue
                used_paires.add((e12.eid, e21.eid))
                count_incorrect_scaffolds += 1
                if 0 < D - e12.len < 100:
                    connect_edges.add((e1.eid, e2.eid))
                    graph_k = self.graph.K
                    max_eid = self.graph.max_eid
                    self.graph.add_edge(max_eid, e12.v2.vid, e21.v1.vid,
                                        graph_k + 3, max_eid + 1)
                    self.graph.add_edge(max_eid + 1, e21.conj.v2.vid,
                                        e12.conj.v1.vid, graph_k + 3, max_eid)
                    seq = e12.seq[-graph_k:] + "NNN" + e21.seq[:graph_k]
                    self.graph.add_seq(max_eid, seq)
                    self.graph.add_seq(max_eid + 1, utils.rc(seq))

                    # Edges along e1 (from e11 onwards) followed by the
                    # new bridge edge.
                    path_1 = []
                    path_2 = []
                    used = set()
                    begin_path = False
                    start_offset = 0
                    for diag in e1.diagonals:
                        if e11 == diag.rectangle.e2:
                            begin_path = True
                        if begin_path and diag.rectangle.e2 not in used:
                            path_1.append(diag.rectangle.e2)
                            used.add(diag.rectangle.e2)
                    path_1.append(self.graph.es[max_eid])
                    last_diag = e1.diagonals[-1]
                    if last_diag.rectangle.e2.len <= last_diag.offsetc:
                        path_1 = path_1[1:]
                        start_offset = 0
                    else:
                        start_offset = last_diag.offsetc

                    # The bridge edge followed by edges along e2 up to e22.
                    path_2.append(self.graph.es[max_eid])
                    path_2.append(e2.diagonals[0].rectangle.e1)
                    used = set()
                    for diag in e2.diagonals:
                        if e22 == diag.rectangle.e1:
                            break
                        if diag.rectangle.e1 not in used:
                            path_2.append(diag.rectangle.e1)
                            used.add(diag.rectangle.e1)

    self.test_utils.logger.info(
        "count_correct_scaffolds " + str(count_correct_scaffolds) + " "
        + str(count_incorrect_scaffolds) + " " + str(len(used_paires)) + "\n")
    return connect_edges
def extract(G, sample):
    """Return the sequence of path ``sample`` through graph ``G``.

    Three cases:

    * ``sample == "_longest_"``: every edge gets a ``'weight'`` equal to the
      number of non-``N`` characters in its target node's ``'seq'`` (0 when
      the node has no ``'seq'``) and the node sequences along
      ``dag_longest_path_custom(G, weight='weight')`` are concatenated.
    * ``sample`` is a key of ``G.graph['path2id']``: the edges whose
      ``'paths'`` contain the sample id are collected into a ``DiGraph``,
      visited in topological order, and each node's sequence (reverse
      complemented when the orientation is not ``"+"``) is appended after
      skipping the overlap implied by the incoming edge's ``'cigar'``.  If
      no edge carries the sample, the sequence of the first node whose
      ``'offsets'`` contain the sample id is returned (``""`` if none).
    * otherwise the problem is logged and the process exits with status 1.

    Changes over the previous version: ``isinstance`` instead of a
    ``type(G) ==`` comparison, ``is None`` instead of ``== None``, pieces are
    joined once instead of grown with ``+=``, and dead commented-out debug
    code was removed.

    Raises:
        SystemExit: if ``sample`` is unknown.
        AssertionError: if a computed overlap exceeds a node's sequence.
    """
    logging.info("Extracting path: %s from graph (%s) of size: (%d,%d)"%(sample,type(G),G.number_of_nodes(),G.number_of_edges()))

    if sample == "_longest_":  # shortcut to extract the "longest" path in terms of sequence
        is_multi = isinstance(G, nx.MultiDiGraph)
        if is_multi:
            # NOTE(review): the converted graph was never used by the
            # original code either; the call is kept in case it has side
            # effects - confirm whether it can be dropped.
            utils.MultiGraphToDiGraph(G)
        for edge in G.edges:
            target = G.node[edge[1]]
            if 'seq' in target:
                weight = len(target['seq']) - target['seq'].count("N")
            else:
                weight = 0
            if is_multi:
                v, t, k = edge
                G[v][t][k]['weight'] = weight
            else:
                v, t = edge
                G[v][t]['weight'] = weight
        return "".join(G.node[n]['seq']
                       for n in dag_longest_path_custom(G, weight='weight'))

    if sample not in G.graph['path2id']:
        logging.fatal("Unknown path: %s, graph contains: %s"%(sample, G.graph['path2id'].keys()))
        sys.exit(1)

    sid = G.graph['path2id'][sample]
    sg = [(n1, n2, d) for n1, n2, d in G.edges(data=True) if sid in d['paths']]

    if not sg:
        # No edge carries this path, so it has to be a single node.
        for n in G:
            if sid in G.node[n]['offsets']:
                return G.node[n]['seq']
        return ""

    # G can be a MultiDiGraph, but the subgraph should be single edge!
    subgraph = nx.DiGraph(sg)
    path = list(nx.topological_sort(subgraph))
    first_edge = G[path[0]][path[1]]
    if isinstance(G, nx.MultiDiGraph):
        inito = first_edge[0]['ofrom']
    else:
        inito = first_edge['ofrom']

    pieces = []
    pnode = None
    for node in path:
        offset = 0
        if pnode is None:
            o = inito
        else:
            link = subgraph[pnode][node]
            o = link['oto']
            if 'cigar' in link and link['cigar'] != '0M':
                # Determine the offset within the segment to allow for
                # overlapping segments; the source of the edge (pnode) is
                # considered the reference.
                for l, t in re.findall(r'(\d+)(\w)', link['cigar']):
                    if t in ('M', 'I', 'S', 'P'):
                        offset += int(l)
        if o == "+":
            s = G.node[node]['seq']
        else:
            s = utils.rc(G.node[node]['seq'])
        assert(len(s) >= offset)
        pieces.append(s[offset:])
        pnode = node
    return "".join(pieces)
def make_graph(self, genome, k):
    """Populate the graph with vertices and edges derived from ``genome``.

    Sets ``self.K`` to ``k`` and indexes the k-mers of ``genome``.  Each
    k-mer not yet marked visited seeds an edge: it is extended with
    ``extend_forward`` / ``extend_backward`` until they return ``None``
    (presumably when no unambiguous neighbour exists - confirm against
    those helpers), the (k-1)-mers at both ends become vertices, and the
    edge is added together with the edge for the reverse strand unless that
    vertex pair already has an edge.  An edge whose vertex pair and body are
    both identical to their reverse-strand counterparts is added once, as
    its own partner.  ``self.etalon_dist`` receives the indexed positions of
    each edge's first k-mer.
    """
    self.K = k
    kmers = self.__get_kmers_pos(genome, k)
    visit = set()
    edges = set()
    verts = dict()
    vid = 0
    eid = 0

    for seed in kmers:
        if seed in visit:
            continue

        # Extend to the right of the seed, collecting one character per step.
        tail = [seed[-1]]
        end_vertex = seed[1:]
        following = extend_forward(end_vertex, kmers)
        while following is not None:
            tail.append(following[-1])
            end_vertex = following[1:]
            visit.add(following)
            visit.add(utils.rc(following))
            following = extend_forward(end_vertex, kmers)

        # Extend to the left; the last character of each preceding k-mer is
        # the character that leaves the start vertex as it shifts left.
        begin_vertex = seed[:-1]
        preceding = extend_backward(begin_vertex, kmers)
        while preceding is not None:
            tail.insert(0, preceding[-1])
            begin_vertex = preceding[0:-1]
            visit.add(preceding)
            visit.add(utils.rc(preceding))
            preceding = extend_backward(begin_vertex, kmers)
        body = begin_vertex + ''.join(tail)

        # Register both end vertices (start first), each with its partner
        # for the reverse strand.
        for label in (begin_vertex, end_vertex):
            if label not in verts:
                forward_ref = self.add_vertex(vid, vid + 1)
                reverse_ref = self.add_vertex(vid + 1, vid)
                verts[label] = forward_ref.vid
                verts[utils.rc(label)] = reverse_ref.vid
                vid += 2

        bv = verts[begin_vertex]
        ev = verts[end_vertex]
        rbv = verts[utils.rc(end_vertex)]
        rev = verts[utils.rc(begin_vertex)]
        if (bv, ev) in edges:
            continue

        span = len(body) - k + 1
        rc_body = utils.rc(body)
        if (bv, ev) == (rbv, rev) and body == rc_body:
            # The edge coincides with its reverse strand: add it once.
            self.add_edge(eid, bv, ev, span, eid)
            edges.add((bv, ev))
            self.add_seq(eid, body)
            self.etalon_dist[eid] = kmers[body[:k]] + kmers[rc_body[:k]]
            eid += 1
        else:
            self.add_edge(eid, bv, ev, span, eid + 1)
            self.add_edge(eid + 1, rbv, rev, span, eid)
            edges.add((bv, ev))
            edges.add((rbv, rev))
            self.add_seq(eid, body)
            self.add_seq(eid + 1, rc_body)
            self.etalon_dist[eid] = kmers[body[:k]]
            self.etalon_dist[eid + 1] = kmers[rc_body[:k]]
            eid += 2
# check(reference, bgraph, K, log, test_util)
#
# Compares the diagonals of every element of ``bgraph.bes`` against a
# reference genome and writes summary counters to ``log``.
#
# What the code below does:
#   * Opens the file ``reference``; asserts that its first line starts with
#     '>' and concatenates the remaining stripped lines into ``genome``.
#   * Builds two k-mer indexes with the nested ``get_index``: one for the
#     genome (positions ``i + 1``) and one for ``utils.rc(genome)``
#     (positions ``-(len(genome) - i - K + 1)``).  The sequence is extended
#     with its own first K characters before slicing.
#   * For every diagonal it calls ``test_util.is_true_diagonal(diag)`` (drives
#     the "etalon" counters), slices ``seq1``/``seq2`` out of the rectangle's
#     edge sequences, looks up candidate reference positions with the nested
#     ``find_index_begin`` and treats the diagonal as true when some pair
#     ``(i1, i2)`` satisfies ``diag.d - 5 <= abs(i1 - i2) <= diag.d + 5``;
#     ``(i1, i1 + len(seq1))`` intervals are collected in ``covered``.
#   * Writes the "true / part_true / false" counters (plain and etalon) and a
#     "missing" count derived from the sorted ``covered`` list to ``log``.
#
# NOTE(review): the ``K`` argument is overwritten by ``K = 15`` on entry, so
#   the caller's value is ignored.
# NOTE(review): ``max_mismatch`` is assigned but not used in the visible code.
# NOTE(review): ``covered[0][1]`` raises IndexError when ``covered`` is empty.
# NOTE(review): ``ref`` is closed with an explicit ``close()``; it stays open
#   if the header assertion fails.
# NOTE(review): this source is whitespace-collapsed, so statement nesting
#   (e.g. whether ``end = cov[1]`` belongs to the preceding ``if``) cannot be
#   determined from here, and the span ends with the opening of a
#   triple-quoted string (``"""def match(...``) whose remainder is not
#   visible.  The code is therefore left untouched.
def check(reference, bgraph, K, log, test_util): K = 15 #logstream = open(os.path.join(folder, 'rectangles.log'), 'a') #corr = os.path.join(folder, 'rectangles.corr.diagonals') #wrong = os.path.join(folder, 'rectangles.wrong.diagonals') max_mismatch = 2 # for match() # read reference genome ref = open(reference) header = ref.readline() assert header[0] == '>' genome = '' for line in ref: genome += line.strip() ref.close() rcgenome = utils.rc(genome) def get_index(genome, not_rc): s = genome + genome[:K] d = {} for i in xrange(len(genome)): kmer = s[i:i + K] if not (kmer in d): d[kmer] = set() if not_rc: d[kmer].add(i + 1) else: d[kmer].add(-(len(genome) - i - K + 1)) return d igenome = get_index(genome, True) ircgenome = get_index(rcgenome, False) def find_index_begin(seq): ref_indexs = set() for i in range(len(seq) - K + 1): kmer = seq[i:i + K] if kmer in igenome: ref_index = igenome[kmer] for ind in ref_index: ref_indexs.add(ind - i) if kmer in ircgenome: ref_index = ircgenome[kmer] for ind in ref_index: ref_indexs.add(ind - i) return ref_indexs count_true_be = 0 count_true_etalon_dist_be = 0 count_part_true_be = 0 count_part_true_etalon_dist_be = 0 count_be = 0 covered = [] for be in bgraph.bes: count_be += 1 is_true_be = True is_part_true = False is_true_etalon_dist = True is_part_true_etalon_dist = False for diag in be.diagonals: is_true_diag = test_util.is_true_diagonal(diag) if is_true_diag: is_part_true_etalon_dist = True else: is_true_etalon_dist = False seq1 = diag.rectangle.e1.seq[diag.offseta:diag.offsetc + 55] seq2 = diag.rectangle.e2.seq[diag.offsetb:diag.offsetd + 55] indexs1 = find_index_begin(seq1) indexs2 = find_index_begin(seq2) is_true = False log.write(str(indexs1) + "\n") log.write(str(indexs2) + "\n") log.write(str(diag.d) + "\n") for i1 in indexs1: for i2 in indexs2: if diag.d - 5 <= abs(i1 - i2) <= diag.d + 5: is_true = True covered.append((i1, i1 + len(seq1))) if is_true: is_part_true = True else: is_true_be = False log.write("diag is true 
" + str(is_true) + "\n") if is_part_true and not is_true_be: count_part_true_be += 1 if is_true_be: count_true_be += 1 if is_part_true_etalon_dist and not is_true_etalon_dist: count_part_true_etalon_dist_be += 1 if is_true_etalon_dist: count_true_etalon_dist_be += 1 #print diag, count_part_true_be, count_true_be log.write("true " + str(count_true_be) + str(" part_true ") + str(count_part_true_be) + " false " + str( count_be - count_true_be - count_part_true_be) + "\n") log.write("etalon true " + str(count_true_etalon_dist_be) + str(" part_true ") + str( count_part_true_etalon_dist_be) + " false " + str( count_be - count_true_etalon_dist_be - count_part_true_etalon_dist_be) + "\n") covered.sort() # print covered missed = 0 end = covered[0][1] for cov in covered: if cov[0] > end: missed += 1 end = cov[1] log.write("missing" + str(missed) + "\n") """def match(genome, seq, pos):
# check(reference, bgraph, K, log, test_util)
#
# Validates the diagonals of each element of ``bgraph.bes`` against the
# genome stored in the file ``reference`` and reports counters through
# ``log.write``.
#
# Visible behaviour:
#   * The reference file must begin with a line starting with '>' (asserted);
#     all following lines are stripped and joined into ``genome``.
#   * ``get_index`` (nested) maps each K-mer of a sequence, taken from the
#     sequence extended by its own first K characters, to a set of
#     positions: ``i + 1`` when ``not_rc`` is true, otherwise
#     ``-(len(genome) - i - K + 1)``.  It is applied to ``genome`` and to
#     ``utils.rc(genome)``.
#   * ``find_index_begin`` (nested) returns, for a sequence, the set of
#     ``position - i`` values over all of its K-mers found in either index.
#   * Per diagonal, ``test_util.is_true_diagonal(diag)`` feeds the "etalon"
#     counters, and the diagonal is considered true when some ``i1`` from
#     ``seq1`` and ``i2`` from ``seq2`` satisfy
#     ``diag.d - 5 <= abs(i1 - i2) <= diag.d + 5``; matching
#     ``(i1, i1 + len(seq1))`` intervals are appended to ``covered``.
#   * After the loop the counters and a "missing" count computed from the
#     sorted ``covered`` intervals are written to ``log``.
#
# NOTE(review): ``K = 15`` on entry overrides the ``K`` argument.
# NOTE(review): ``max_mismatch`` is assigned but not used in the visible code.
# NOTE(review): ``covered[0][1]`` raises IndexError when nothing was covered.
# NOTE(review): ``ref`` is not closed if the header assertion fails.
# NOTE(review): the source is whitespace-collapsed, so block nesting cannot
#   be recovered with certainty, and the span ends inside an unterminated
#   triple-quoted string (``"""def match(...``) that continues beyond the
#   visible text.  The code is therefore left untouched.
def check(reference, bgraph, K, log, test_util): K = 15 #logstream = open(os.path.join(folder, 'rectangles.log'), 'a') #corr = os.path.join(folder, 'rectangles.corr.diagonals') #wrong = os.path.join(folder, 'rectangles.wrong.diagonals') max_mismatch = 2 # for match() # read reference genome ref = open(reference) header = ref.readline() assert header[0] == '>' genome = '' for line in ref: genome += line.strip() ref.close() rcgenome = utils.rc(genome) def get_index(genome, not_rc): s = genome + genome[:K] d = {} for i in xrange(len(genome)): kmer = s[i:i + K] if not (kmer in d): d[kmer] = set() if not_rc: d[kmer].add(i + 1) else: d[kmer].add(-(len(genome) - i - K + 1)) return d igenome = get_index(genome, True) ircgenome = get_index(rcgenome, False) def find_index_begin(seq): ref_indexs = set() for i in range(len(seq) - K + 1): kmer = seq[i:i + K] if kmer in igenome: ref_index = igenome[kmer] for ind in ref_index: ref_indexs.add(ind - i) if kmer in ircgenome: ref_index = ircgenome[kmer] for ind in ref_index: ref_indexs.add(ind - i) return ref_indexs count_true_be = 0 count_true_etalon_dist_be = 0 count_part_true_be = 0 count_part_true_etalon_dist_be = 0 count_be = 0 covered = [] for be in bgraph.bes: count_be += 1 is_true_be = True is_part_true = False is_true_etalon_dist = True is_part_true_etalon_dist = False for diag in be.diagonals: is_true_diag = test_util.is_true_diagonal(diag) if is_true_diag: is_part_true_etalon_dist = True else: is_true_etalon_dist = False seq1 = diag.rectangle.e1.seq[diag.offseta:diag.offsetc + 55] seq2 = diag.rectangle.e2.seq[diag.offsetb:diag.offsetd + 55] indexs1 = find_index_begin(seq1) indexs2 = find_index_begin(seq2) is_true = False log.write(str(indexs1) + "\n") log.write(str(indexs2) + "\n") log.write(str(diag.d) + "\n") for i1 in indexs1: for i2 in indexs2: if diag.d - 5 <= abs(i1 - i2) <= diag.d + 5: is_true = True covered.append((i1, i1 + len(seq1))) if is_true: is_part_true = True else: is_true_be = False log.write("diag is true 
" + str(is_true) + "\n") if is_part_true and not is_true_be: count_part_true_be += 1 if is_true_be: count_true_be += 1 if is_part_true_etalon_dist and not is_true_etalon_dist: count_part_true_etalon_dist_be += 1 if is_true_etalon_dist: count_true_etalon_dist_be += 1 #print diag, count_part_true_be, count_true_be log.write("true " + str(count_true_be) + str(" part_true ") + str(count_part_true_be) + " false " + str(count_be - count_true_be - count_part_true_be) + "\n") log.write("etalon true " + str(count_true_etalon_dist_be) + str(" part_true ") + str(count_part_true_etalon_dist_be) + " false " + str(count_be - count_true_etalon_dist_be - count_part_true_etalon_dist_be) + "\n") covered.sort() # print covered missed = 0 end = covered[0][1] for cov in covered: if cov[0] > end: missed += 1 end = cov[1] log.write("missing" + str(missed) + "\n") """def match(genome, seq, pos):