def test_from_list(self):
    """Check the construction of a SegmentEnd from a single list argument.

    A two-element list gives an instance equal to the one built from two
    arguments; a three-element list raises gfapy.ArgumentError; the end
    type is not validated on construction.
    """
    from_pair = gfapy.SegmentEnd(["a", "L"])
    self.assertEqual(TestUnitSegmentEnd.se_s, from_pair)
    built_class = gfapy.SegmentEnd(["a", "L"]).__class__
    self.assertEqual(gfapy.SegmentEnd, built_class)
    with self.assertRaises(gfapy.ArgumentError):
        gfapy.SegmentEnd(["a", "L", "L"])
    # no validation: an unknown end type is accepted here
    gfapy.SegmentEnd(["a", "X"])
def _delete_other_links(self, segment_end, other_end,
                        conserve_components=False):
    """Disconnect the dovetails of a segment end, except the one to other_end.

    Parameters:
      segment_end : the segment end whose dovetails are pruned (anything
        accepted by gfapy.SegmentEnd())
      other_end : the segment end whose dovetail with segment_end is kept
        (anything accepted by gfapy.SegmentEnd())
      conserve_components (bool) : if set, dovetails for which
        self.is_cut_link() is true are kept as well
    """
    segment_end = gfapy.SegmentEnd(segment_end)
    other_end = gfapy.SegmentEnd(other_end)
    s = self.try_get_segment(segment_end.segment)
    # Iterate over a snapshot of the links.
    # NOTE(review): presumably disconnect() removes the link from the
    # collection returned by dovetails_of_end() -- confirm.
    for d in list(s.dovetails_of_end(segment_end.end_type)):
        # the link leading to other_end is the one which shall survive
        if d.other_end(segment_end) == other_end:
            continue
        if not conserve_components or not self.is_cut_link(d):
            d.disconnect()
def test_equal(self):
    """Check the equality semantics of SegmentEnd instances."""
    cls = TestUnitSegmentEnd
    left_by_name = gfapy.SegmentEnd(cls.sym, "L")
    right_by_line = gfapy.SegmentEnd(cls.ref, "R")
    self.assertEqual(cls.se_s, left_by_name)
    self.assertEqual(cls.se_r, right_by_line)
    # only name and end_type equivalence is checked, not segment
    assert cls.se_r != cls.se_s
    assert cls.se_r.inverted() == cls.se_s
    # equivalence to array
    assert cls.se_s == ["a", "L"]
    assert cls.se_r == ["a", "R"]
def _junction_junction_paths(self, sn, exclude):
    """Collect the two-element paths which connect a segment to its neighbours.

    A neighbour is considered if its name is in exclude, or if the
    neighbour end involved in the dovetail has exactly one dovetail.

    Parameters:
      sn (str) : the name of the segment
      exclude (list or set) : names of segments; sn is added to it

    Returns:
      list : one [True, first_end, second_end, True] entry for each
        selected neighbour
    """
    retval = []
    # Other path-finding code in this file handles 'exclude' as a set,
    # while this method used to require a list: accept both.
    if hasattr(exclude, "add"):
        exclude.add(sn)
    else:
        exclude.append(sn)
    s = self.segment(sn)
    for dL in s.dovetails_L:
        eL = dL.other_end(gfapy.SegmentEnd(s, "L"))
        if (eL.name in exclude) or \
                (len(eL.segment.dovetails_of_end(eL.end_type)) == 1):
            retval.append([True, eL, gfapy.SegmentEnd(s, "R"), True])
    for dR in s.dovetails_R:
        eR = dR.other_end(gfapy.SegmentEnd(s, "R"))
        if (eR.name in exclude) or \
                (len(eR.segment.dovetails_of_end(eR.end_type)) == 1):
            retval.append([True, gfapy.SegmentEnd(s, "R"),
                           eR.inverted(), True])
    return retval
def segment_connected_component(self, segment, visited=None):
    """Compute the connected component to which a segment belongs.

    Note: only dovetail overlaps are considered as connections

    Parameters:
      segment (str, Line) : a segment name or instance
      visited (set) : names of the segments already visited; a new set is
        used if None; the names found during the traversal are added to it

    Returns:
      list : a list of segment instances
    """
    seen = set() if visited is None else visited
    if isinstance(segment, gfapy.Line):
        start = segment
        start_name = segment.name
    else:
        start_name = segment
        start = self.segment(segment)
    seen.add(start_name)
    component = {start}
    # explore the graph starting from both ends of the segment
    for end_type in ["L", "R"]:
        self.__traverse_component(gfapy.SegmentEnd(start, end_type),
                                  component, seen)
    return list(component)
def is_cut_segment(self, segment):
    """Does the removal of a segment split a connected component?

    Note: only dovetail overlaps are considered as connections

    Parameters:
      segment (str, Line) : a segment name or instance

    Returns:
      bool
    """
    if isinstance(segment, str):
        segment = self.try_get_segment(segment)
    if segment._connectivity() in [(0, 0), (0, 1), (1, 0)]:
        return False
    # one start point for each dovetail: the inverted end of the neighbour
    start_points = {
        link.other_end(gfapy.SegmentEnd(segment.name, et)).inverted()
        for et in ["L", "R"]
        for link in segment.dovetails_of_end(et)
    }
    components = []
    for start_point in start_points:
        reached = set()
        components.append(reached)
        # the segment itself is marked as visited, so it is never crossed
        blocked = set()
        blocked.add(segment.name)
        self.__traverse_component(start_point, reached, blocked)
    # a cut segment: not all start points reach the same set of segments
    reference = components[0] if components else None
    for reached in components:
        if reached != reference:
            return True
    return False
def merge_linear_path(gfa, segpath, enable_tracking=False, merged_name=None,
                      cut_counts=False):
    """Merge a specified linear path of dovetail overlaps connecting segments.

    Note: for the parameter usage, see merge_linear_paths(); the only
    difference is that merged_name can be set to a string (different from
    'short'), which will be used as a name for the merged segment.

    Parameters:
      gfa : the graph which contains the path
      segpath (list) : the path; each element is passed to
        gfapy.SegmentEnd()

    Returns:
      the gfa argument; the graph is left untouched if the path has fewer
      than two elements
    """
    if len(segpath) < 2:
        return gfa
    segpath = [gfapy.SegmentEnd(s) for s in segpath]
    merged, first_reversed, last_reversed = create_merged_segment(
        gfa, segpath, merged_name=merged_name, cut_counts=cut_counts,
        enable_tracking=enable_tracking)
    gfa.append(merged)
    # connect the merged segment to the neighbours of both path extremities
    link_merged(gfa, merged.name, segpath[0].inverted(), first_reversed)
    link_merged(gfa, merged.name, segpath[-1], last_reversed)
    # all the segments of the path are replaced by the merged one
    for sn_et in segpath:
        gfa.segment(sn_et.segment).disconnect()
    # same return value as in the short-path case above
    return gfa
# NOTE(review): in the original text a truncated docstring fragment followed
# here; it presumably belongs to a following definition:
#   """Find and merge linear paths of dovetail overlaps connecting segments.
def test_segment(self):
    """Check reading and assigning the segment attribute of a SegmentEnd."""
    cls = TestUnitSegmentEnd
    self.assertEqual(cls.sym, cls.se_s.segment)
    self.assertEqual(cls.ref, cls.se_r.segment)
    # the attribute can be replaced after construction
    reassigned = gfapy.SegmentEnd(cls.sym, "R")
    reassigned.segment = cls.ref
    self.assertEqual(cls.ref, reassigned.segment)
def _distribute_links(self, links_distribution_policy, segment_name,
                      copy_names, factor):
    """Distribute the dovetails of one segment end among segment copies.

    Nothing is done if factor is lower than 2 or if no end is selected by
    self._select_distribute_end().

    Parameters:
      links_distribution_policy : passed to self._select_distribute_end()
      segment_name (str) : the name of the original segment
      copy_names (list) : the names of the copies of the segment
      factor (int) : compared with the number of dovetails of the
        selected end, in order to compute how many of them each segment
        keeps
    """
    if factor < 2:
        return
    end_type = self._select_distribute_end(links_distribution_policy,
                                           segment_name, factor)
    if end_type is None:
        return
    original_links = self.segment(segment_name).dovetails_of_end(end_type)
    surplus = max(len(original_links) - factor, 0)
    # each link is identified by the repr of the end it leads to
    signatures = []
    for link in original_links:
        origin = gfapy.SegmentEnd(segment_name, end_type)
        signatures.append(repr(link.other_end(origin)))
    for index, name in enumerate([segment_name] + copy_names):
        kept = signatures[index:index + surplus + 1]
        # work on a copy of the list, as links are disconnected in the loop
        for link in self.segment(name).dovetails_of_end(end_type).copy():
            signature = repr(link.other_end(gfapy.SegmentEnd(name, end_type)))
            if signature not in kept:
                link.disconnect()
def test_segment_ends_path(self):
    """Check that reversing a SegmentEndsPath also inverts its elements."""

    def inverted_reversed():
        # expected content of the reversed path, as a plain list
        return [gfapy.SegmentEnd("b", "L"), gfapy.SegmentEnd("a", "R")]

    path = gfapy.SegmentEndsPath(
        [gfapy.SegmentEnd("a", "L"), gfapy.SegmentEnd("b", "R")])
    # reversed() does not change the path itself
    self.assertEqual(inverted_reversed(), list(reversed(path)))
    self.assertNotEqual(inverted_reversed(), path)
    # reverse() works in place
    path.reverse()
    self.assertEqual(inverted_reversed(), path)
def __traverse_component(self, segment_end, c, visited):
    """Collect the segments reachable from a segment end through dovetails.

    The traversal uses an explicit stack instead of recursion, so that
    large components do not exceed the recursion limit of the interpreter.
    The order in which segments are found may differ from that of a
    recursive traversal, the final content of c and visited does not.

    Parameters:
      segment_end (SegmentEnd) : the starting point; its segment attribute
        must be a gfapy.Line instance
      c (set) : the segments found are added to this set
      visited (set) : names of the segments which shall not be entered;
        the names of the segments found are added to it
    """
    pending = [segment_end]
    while pending:
        current = pending.pop()
        s = current.segment
        assert (isinstance(s, gfapy.Line))
        for l in s.dovetails_of_end(current.end_type):
            oe = l.other_end(current)
            sn = oe.name
            if sn in visited:
                continue
            visited.add(sn)
            neighbour = oe.segment
            c.add(neighbour)
            # both ends of each newly found segment are explored;
            # "R" is pushed first, so that "L" is handled first
            pending.append(gfapy.SegmentEnd(neighbour, "R"))
            pending.append(gfapy.SegmentEnd(neighbour, "L"))
def merge_linear_path(self, segpath, redundant_junctions=False, jntag="jn",
                      enable_tracking=False, merged_name=None,
                      cut_counts=False):
    """Merge a specified linear path of dovetail overlaps connecting segments.

    Note: for the parameter usage, see merge_linear_paths(); the only
    difference is that merged_name can be set to a string (different from
    'short'), which will be used as a name for the merged segment.

    If the first element of segpath is a boolean, the first and the last
    elements are removed from segpath (in place) and used as flags, which
    select how the merged segment is linked at the two path extremities.

    Returns:
      self
    """
    if len(segpath) < 2:
        return self
    first_redundant = False
    last_redundant = False
    if segpath[0] in [True, False]:
        first_redundant = segpath.pop(0)
        last_redundant = segpath.pop()
    segpath = [gfapy.SegmentEnd(s) for s in segpath]
    merged, first_reversed, last_reversed = self.__create_merged_segment(
        segpath,
        redundant_junctions=redundant_junctions,
        jntag=jntag,
        merged_name=merged_name,
        cut_counts=cut_counts,
        enable_tracking=enable_tracking)
    self.append(merged)
    path_start = segpath[0]
    path_end = segpath[-1]
    if first_redundant:
        self._link_duplicated_first(merged, self.segment(path_start.segment),
                                    first_reversed, jntag)
    else:
        self.__link_merged(merged.name, path_start.inverted(), first_reversed)
    if last_redundant:
        self._link_duplicated_last(merged, self.segment(path_end.segment),
                                   last_reversed, jntag)
    else:
        self.__link_merged(merged.name, path_end, last_reversed)
    # the segments at a redundant extremity are not disconnected
    start = 1 if first_redundant else 0
    stop = -1 if last_redundant else None
    for sn_et in segpath[start:stop]:
        self.segment(sn_et.segment).disconnect()
        # NOTE(review): the nesting of the progress log inside the loop is
        # reconstructed from a flattened source -- confirm
        if self._progress:
            self._progress_log("merge_linear_paths", 0.05)
    return self
def __traverse_linear_path(self, segment_end, exclude):
    """Follow the dovetails from a segment end, as long as the path is linear.

    Parameters:
      segment_end (SegmentEnd) : the starting point of the traversal
      exclude (set) : names of segments at which the traversal stops; the
        names of the segments added to the path are added to it

    Returns:
      the path found: a list (reversed) if the end type of segment_end is
      "L", a gfapy.SegmentEndsPath otherwise
    """
    path = gfapy.SegmentEndsPath()
    current = gfapy.SegmentEnd(segment_end)
    current.segment = self.segment(current.segment)
    while True:
        segment = current.segment
        after = segment.dovetails_of_end(current.end_type)
        before = segment.dovetails_of_end(gfapy.invert(current.end_type))
        single_before = (len(before) == 1)
        if not path or (single_before and len(after) == 1):
            # starting segment or internal segment of the path: go on
            path.append(gfapy.SegmentEnd(current.name, current.end_type))
            exclude.add(current.name)
            current = after[0].other_end(current).inverted()
            if current.name in exclude:
                break
            continue
        if single_before:
            # last segment of the path
            path.append(gfapy.SegmentEnd(current.name, current.end_type))
            exclude.add(current.name)
        break
    if segment_end.end_type == "L":
        return list(reversed(path))
    return path
def test_add_links(self):
    """Check that links added to a Gfa are reachable from their segments."""
    link_1_2 = gfapy.Line("L\t1\t+\t2\t+\t12M")
    gfa = gfapy.Gfa()
    for segment_line in ("S\t1\t*", "S\t2\t*"):
        gfa.append(segment_line)
    gfa.append(link_1_2)  # nothing raised
    self.assertEqual([link_1_2], gfa.dovetails)
    seg1 = gfa.segment("1")
    self.assertEqual([link_1_2], seg1.end_relations("R", ["2", "L"]))
    self.assertEqual([link_1_2],
                     gfa.segment("2").end_relations("L", ["1", "R"]))
    # the other end can also be given as a SegmentEnd instance
    self.assertEqual([],
                     gfa.segment("2").end_relations(
                         "R", gfapy.SegmentEnd("1", "L")))
    # a link to segment "3", which was not added to the graph
    gfa.append("L\t1\t+\t3\t+\t12M")  # nothing raised
def to_end(self):
    """The segment end corresponding to the to_segment field of L lines.

    Note:
      The result is meaningful only for dovetails overlaps (GFA1 L lines
      or GFA2 E lines representing dovetail overlaps).

    For a L line, the to_orient field is used to compute if the overlap
    involves the left (5') or right (3') end of the to_segment and the
    SegmentEnd end_type property is set accordingly to 'L' or 'R'.

    For a E line, it is first computed which of the sid1/sid2 corresponds
    to the to_segment field of a L line, then the same computation is done,
    as for L lines.

    Returns:
      gfapy.segment_end.SegmentEnd
    """
    segment = self.to_segment
    # a "+" orientation gives the left end, any other value the right end
    if self.to_orient == "+":
        end_type = "L"
    else:
        end_type = "R"
    return gfapy.SegmentEnd(segment, end_type)
def _randomly_orient_proven_invertible_segment(self, segment_name):
    """Keep a single link on each end of a segment and annotate the choice.

    Two links are selected from the partitions returned by
    self._partitioned_links_of() for the right end of the segment. Nothing
    is done if the partitions do not have the expected shape, or if one of
    the two selected neighbour ends has fewer than two dovetails.

    Parameters:
      segment_name (str) : the name of the segment
    """
    se = gfapy.SegmentEnd([segment_name, "R"])
    parts = self._partitioned_links_of(se)
    if len(parts) == 2:
        tokeep1_other_end = parts[0][0].other_end(se)
        tokeep2_other_end = parts[1][0].other_end(se)
    elif len(parts) == 1 and len(parts[0]) == 2:
        tokeep1_other_end = parts[0][0].other_end(se)
        tokeep2_other_end = parts[0][1].other_end(se)
    else:
        return
    # The dovetails of a single end are obtained by dovetails_of_end(),
    # as in the rest of the code; 'dovetails' is a collection of the
    # segment, not a method taking an end type.
    for other_end in (tokeep1_other_end, tokeep2_other_end):
        if len(other_end.segment.dovetails_of_end(other_end.end_type)) < 2:
            return
    self._delete_other_links(se, tokeep1_other_end)
    self._delete_other_links(se.inverted(), tokeep2_other_end)
    self._annotate_random_orientation(segment_name)
def linear_path(self, segment, exclude=None):
    """Find a linear path which contains the specified segment

    Parameters:
      segment (str, Line): the segment to analyse
      exclude : (API private)

    Returns:
      gfapy.SegmentEndsPath : the path found; it is empty if none of the
        two elements of the connectivity of the segment is 1
    """
    if isinstance(segment, gfapy.Line):
        segment_name = segment.name
    else:
        segment_name, segment = segment, self.segment(segment)
    connectivity = segment._connectivity()
    if exclude is None:
        exclude = set()
    path = gfapy.SegmentEndsPath()
    for end_type, n_links in zip(["L", "R"], connectivity):
        if n_links != 1:
            continue
        exclude.add(segment_name)
        # the last element of the path found so far is dropped, before
        # the result of the second traversal is appended
        if len(path) > 0:
            path.pop()
        path += self.__traverse_linear_path(
            gfapy.SegmentEnd(segment, end_type), exclude)
    return path
def test_end_type(self):
    """Check reading and assigning the end_type attribute of a SegmentEnd."""
    cls = TestUnitSegmentEnd
    self.assertEqual("L", cls.se_s.end_type)
    self.assertEqual("R", cls.se_r.end_type)
    # the attribute can be replaced after construction
    flipped = gfapy.SegmentEnd(cls.sym, "L")
    flipped.end_type = "R"
    self.assertEqual("R", flipped.end_type)
def _segment_same_links_both_ends(self, segment_name):
    """Compare the link targets of the two ends of a segment.

    Parameters:
      segment_name (str) : the name of the segment

    Returns:
      bool : the result of comparing what self._link_targets_for_cmp()
        returns for the right end and for the left end of the segment
    """
    right_targets, left_targets = [
        self._link_targets_for_cmp(gfapy.SegmentEnd(segment_name, end_type))
        for end_type in ("R", "L")
    ]
    return right_targets == left_targets
def create_merged_segment(gfa, segpath, merged_name=None,
                          enable_tracking=False, cut_counts=False):
    """Create the segment which results from merging a linear path.

    Parameters:
      gfa : the graph which contains the segments of the path
      segpath (list) : the path; the first element must be a SegmentEnd,
        the following ones are passed to gfapy.SegmentEnd()
      merged_name (str) : passed to gfa._add_segment_to_merged(); if it is
        'short', the result of gfa.unused_name() is used instead
      enable_tracking (bool) : passed to gfa._add_segment_to_merged()
      cut_counts (bool) : if set, the summed counts are multiplied by
        merged.length / (total_cut + merged.length)

    Returns:
      tuple : (merged, first_reversed, last_reversed); last_reversed is
        None if the path consists of a single element

    Raises:
      gfapy.ValueError : if two consecutive elements of the path are not
        connected by exactly one dovetail
      gfapy.InconsistencyError : if gfa._vlevel > 0 and the LN of the merged
        segment differs from the length of its sequence
    """
    a = segpath[0]
    merged = gfa.try_get_segment(a.segment).clone()
    # The validation level is set to 0 during the construction and restored
    # afterwards.
    # NOTE(review): presumably because name and sequence temporarily
    # contain lists (see the joins below) -- confirm.
    merged_vlevel = merged.vlevel
    merged.vlevel = 0
    total_cut = 0
    first_reversed = (a.end_type == "L")
    last_reversed = None
    if merged_name == "short":
        merged_name = gfa.unused_name()
    gfa._add_segment_to_merged(merged, gfa.segment(a.segment),
                               first_reversed, 0, True,
                               enable_tracking=enable_tracking,
                               merged_name=merged_name)
    for s in segpath[1:]:
        b = gfapy.SegmentEnd(s).inverted()
        ls = gfa.segment(a.segment).end_relations(a.end_type, b, "dovetails")
        if len(ls) != 1:
            raise gfapy.ValueError(
                "A single link was expected between {} and {}, "
                "{} were found".format(a, b, len(ls)))
        l = ls[0]
        # The length of the overlap on the query is cut, but not more than
        # the LN of the segment which is appended. A former variant, which
        # allowed only M/= operations in the overlap, is not used.
        if not l.overlap:
            cut = 0
        else:
            cut = min(l.overlap.length_on_query(),
                      gfa.segment(b.segment).LN)
        total_cut += cut
        last_reversed = (b.end_type == "R")
        gfa._add_segment_to_merged(merged, gfa.segment(b.segment),
                                   last_reversed, cut, False,
                                   enable_tracking=enable_tracking,
                                   merged_name=merged_name)
        a = gfapy.SegmentEnd(b).inverted()
    merged.vlevel = merged_vlevel
    if isinstance(merged.name, list):
        merged.name = "_".join(merged.name)
    ortag = merged.get("or")
    if isinstance(ortag, list):
        merged.set("or", ",".join(ortag))
    if not gfapy.is_placeholder(merged.sequence):
        merged.sequence = "".join(merged.sequence)
        if not merged.LN:
            merged.LN = len(merged.sequence)
        elif gfa._vlevel > 0 and merged.LN != len(merged.sequence):
            # the sequence is a str here: its length is given by len()
            raise gfapy.InconsistencyError(
                "Computed sequence length {} ".format(len(merged.sequence)) +
                "and computed LN {} differ".format(merged.LN))
    if merged.length is None:
        # without a length the counts cannot be scaled: they are removed
        for count_tag in ["KC", "RC", "FC"]:
            merged.set(count_tag, None)
    else:
        factor = 1
        if cut_counts:
            factor = merged.length / (total_cut + merged.length)
        # NOTE(review): at module level '__sum_of_counts' is not
        # name-mangled -- confirm that gfa provides the method under
        # exactly this name
        for count_tag, count in gfa.__sum_of_counts(segpath, factor).items():
            merged.set(count_tag, count)
    return merged, first_reversed, last_reversed
def test_new(self):
    """Check that a SegmentEnd can be constructed from two arguments."""
    cls = TestUnitSegmentEnd
    gfapy.SegmentEnd(cls.sym, "L")
    # no validation on creation: invalid name and end type are accepted
    gfapy.SegmentEnd(cls.invalid_sym, "X")
def end_relations(self, extremity, segment_end, collection="edges"):
    """Select the relations which connect an end of self to a segment end.

    Parameters:
      extremity (str) : the end type of self to consider
      segment_end : the segment end on the other side of the relation; it
        is compared with what other_end() returns
      collection (str) : the name of the attribute of self which contains
        the relations to consider

    Returns:
      list : the elements of the collection whose other end, computed
        from the given end of self, is equal to segment_end
    """
    selected = []
    for relation in getattr(self, collection):
        origin = gfapy.SegmentEnd(self, extremity)
        if relation.other_end(origin, tolerant=True) == segment_end:
            selected.append(relation)
    return selected
def test_validate(self):
    """Check that validate() accepts valid ends and rejects end type X."""
    for valid_end in (TestUnitSegmentEnd.se_s, TestUnitSegmentEnd.se_r):
        valid_end.validate()
    invalid_end = gfapy.SegmentEnd("a", "X")
    with self.assertRaises(gfapy.ValueError):
        invalid_end.validate()
class TestUnitSegmentEnd(unittest.TestCase):
    """Unit tests of gfapy.SegmentEnd and gfapy.SegmentEndsPath."""

    # Data shared by the tests: a segment name, a segment line with the
    # same name, and segment ends built from each of them.
    sym = "a"
    ref = gfapy.Line("S\ta\t*\txx:Z:1.0")
    invalid_sym = "a\ta"
    invalid_ref = []
    se_s = gfapy.SegmentEnd(sym, "L")
    se_r = gfapy.SegmentEnd(ref, "R")
    se_s_str = "aL"
    se_r_str = "aR"
    se_s_sym = "aL"
    se_r_sym = "aR"

    def test_new(self):
        """Check that a SegmentEnd can be constructed from two arguments."""
        cls = TestUnitSegmentEnd
        gfapy.SegmentEnd(cls.sym, "L")
        # no validation on creation
        gfapy.SegmentEnd(cls.invalid_sym, "X")

    def test_from_list(self):
        """Check the construction of a SegmentEnd from a single list."""
        from_pair = gfapy.SegmentEnd(["a", "L"])
        self.assertEqual(TestUnitSegmentEnd.se_s, from_pair)
        built_class = gfapy.SegmentEnd(["a", "L"]).__class__
        self.assertEqual(gfapy.SegmentEnd, built_class)
        with self.assertRaises(gfapy.ArgumentError):
            gfapy.SegmentEnd(["a", "L", "L"])
        # no validation
        gfapy.SegmentEnd(["a", "X"])

    def test_segment(self):
        """Check reading and assigning the segment attribute."""
        cls = TestUnitSegmentEnd
        self.assertEqual(cls.sym, cls.se_s.segment)
        self.assertEqual(cls.ref, cls.se_r.segment)
        reassigned = gfapy.SegmentEnd(cls.sym, "R")
        reassigned.segment = cls.ref
        self.assertEqual(cls.ref, reassigned.segment)

    def test_end_type(self):
        """Check reading and assigning the end_type attribute."""
        cls = TestUnitSegmentEnd
        self.assertEqual("L", cls.se_s.end_type)
        self.assertEqual("R", cls.se_r.end_type)
        flipped = gfapy.SegmentEnd(cls.sym, "L")
        flipped.end_type = "R"
        self.assertEqual("R", flipped.end_type)

    def test_name(self):
        """Check that name is the same for a name and for a line segment."""
        cls = TestUnitSegmentEnd
        for segment_end in (cls.se_s, cls.se_r):
            self.assertEqual(cls.sym, segment_end.name)

    def test_validate(self):
        """Check that validate() accepts valid ends and rejects type X."""
        for valid_end in (TestUnitSegmentEnd.se_s, TestUnitSegmentEnd.se_r):
            valid_end.validate()
        invalid_end = gfapy.SegmentEnd("a", "X")
        with self.assertRaises(gfapy.ValueError):
            invalid_end.validate()

    def test_inverted(self):
        """Check that inverted() keeps the segment and swaps the end type."""
        cls = TestUnitSegmentEnd
        from_left = cls.se_s.inverted()
        self.assertEqual(cls.se_s.segment, from_left.segment)
        self.assertEqual("R", from_left.end_type)
        from_right = cls.se_r.inverted()
        self.assertEqual(cls.se_r.segment, from_right.segment)
        self.assertEqual("L", from_right.end_type)

    def test_to_s(self):
        """Check the string representation of segment ends."""
        cls = TestUnitSegmentEnd
        self.assertEqual(cls.se_s_str, str(cls.se_s))
        self.assertEqual(cls.se_r_str, str(cls.se_r))

    def test_equal(self):
        """Check the equality semantics of SegmentEnd instances."""
        cls = TestUnitSegmentEnd
        left_by_name = gfapy.SegmentEnd(cls.sym, "L")
        right_by_line = gfapy.SegmentEnd(cls.ref, "R")
        self.assertEqual(cls.se_s, left_by_name)
        self.assertEqual(cls.se_r, right_by_line)
        # only name and end_type equivalence is checked, not segment
        assert cls.se_r != cls.se_s
        assert cls.se_r.inverted() == cls.se_s
        # equivalence to array
        assert cls.se_s == ["a", "L"]
        assert cls.se_r == ["a", "R"]

    #def test_comparison(self):
    #  self.assertEqual(-1, ["a","L"].to_segment_end() <=> ["b","L"].to_segment_end())
    #  self.assertEqual(0,  ["a","L"].to_segment_end() <=> ["a","L"].to_segment_end())
    #  self.assertEqual(1,  ["b","L"].to_segment_end() <=> ["a","L"].to_segment_end())
    #  self.assertEqual(-1, ["a","L"].to_segment_end() <=> ["a","R"].to_segment_end())
    #  self.assertEqual(0,  ["a","R"].to_segment_end() <=> ["a","R"].to_segment_end())
    #  self.assertEqual(1,  ["a","R"].to_segment_end() <=> ["a","L"].to_segment_end())

    def test_segment_ends_path(self):
        """Check that reversing a SegmentEndsPath inverts its elements."""

        def inverted_reversed():
            # expected content of the reversed path, as a plain list
            return [gfapy.SegmentEnd("b", "L"), gfapy.SegmentEnd("a", "R")]

        path = gfapy.SegmentEndsPath(
            [gfapy.SegmentEnd("a", "L"), gfapy.SegmentEnd("b", "R")])
        # reversed() does not change the path itself
        self.assertEqual(inverted_reversed(), list(reversed(path)))
        self.assertNotEqual(inverted_reversed(), path)
        # reverse() works in place
        path.reverse()
        self.assertEqual(inverted_reversed(), path)