def _has_eql_fields(self, refline, ignore_fields = None):
    """Tell whether this line agrees with ``refline`` on the compared fields.

    The compared fields are the positional fields and the tags of
    ``refline``, minus those listed in ``ignore_fields``. A field is skipped
    whenever its value is a placeholder in either of the two lines.

    Parameters:
      refline (gfapy.Line) : the reference line
      ignore_fields (list) : names of the fields to leave out of the
        comparison; the list is handed to ``_dealias_fieldnames`` before
        being used. If it contains "record_type", the record types are not
        compared; if it contains "name", the field named by ``NAME_FIELD``
        of the class of ``refline`` is left out as well.

    Returns:
      bool : False if the record types differ (unless ignored), if a
        compared field of ``refline`` is None in this line, or if two
        non-placeholder values differ; True otherwise.
    """
    assert(isinstance(refline, gfapy.Line))
    ignored = [] if ignore_fields is None else ignore_fields
    self._dealias_fieldnames(ignored)
    if "record_type" not in ignored \
            and self.record_type != refline.record_type:
        return False
    candidates = refline.positional_fieldnames + refline.tagnames
    compared = [fn for fn in candidates if fn not in ignored]
    if "name" in ignored:
        name_field = refline.__class__.NAME_FIELD
        if name_field in compared:
            compared.remove(name_field)
    for fn in compared:
        expected = refline.get(fn)
        if gfapy.is_placeholder(expected):
            # Undefined in the reference: nothing to compare against.
            continue
        actual = self.get(fn)
        if actual is None:
            return False
        if not gfapy.is_placeholder(actual) and actual != expected:
            return False
    return True
def test_is_placeholder(self):
    """Check gfapy.is_placeholder on the alignment fixtures of the class."""
    fixtures = TestApiAlignment
    # Empty alignments and explicit placeholders must be recognized.
    placeholders = (
        fixtures.cigar_empty,
        fixtures.trace_empty,
        fixtures.placeholder,
        fixtures.placeholder_s,
    )
    for alignment in placeholders:
        assert (gfapy.is_placeholder(alignment))
    # Non-empty CIGARs and traces (objects and strings) must not.
    non_placeholders = (
        fixtures.cigar_1,
        fixtures.cigar_1_s,
        fixtures.trace_1,
        fixtures.trace_1_s,
    )
    for alignment in non_placeholders:
        assert (not gfapy.is_placeholder(alignment))
def test_extensions(self):
    """Check parsing and cross-references of the custom M and T records."""
    g = gfapy.Gfa(version="gfa2", vlevel=0)
    MetagenomicAssignment(["M", "*", "N12", "C", "SC:i:20"])

    def add(text):
        # Parse the line and append it immediately, so that lines are
        # created and added to the Gfa in exactly the order listed below.
        new_line = gfapy.Line(text)
        g.append(new_line)
        return new_line

    sA = add("S\tA\t1000\t*")
    tB12 = add("T\tB12_c")
    m1 = add("M\t1\ttaxon:123\tA\tSC:i:40\txx:Z:cjaks536")
    m2 = add("M\t2\ttaxon:123\tB\txx:Z:cga5r5cs")
    sB = add("S\tB\t1000\t*")
    mx = add("M\t*\tB12_c\tB\tSC:i:20")
    t123 = add("T\ttaxon:123\tUL:Z:http://www.taxon123.com")

    # Classes chosen for the custom record types.
    self.assertEqual(MetagenomicAssignment, m1.__class__)
    self.assertEqual(Taxon, tB12.__class__)

    # Fields of the metagenomic assignments.
    self.assertEqual("1", m1.mid)
    assert (gfapy.is_placeholder(mx.mid))
    self.assertEqual(t123, m1.tid)
    self.assertEqual(sA, m1.sid)
    self.assertEqual("cjaks536", m1.xx)

    # Back-references from segments and taxa.
    self.assertEqual([m2, mx], sB.metagenomic_assignments)
    self.assertEqual([m1, m2], t123.metagenomic_assignments)

    # Fields of the taxon line.
    self.assertEqual("taxon:123", t123.tid)
    self.assertEqual("http://www.taxon123.com", t123.UL)
def _to_gfa1_a(self):
    """List of the field content of the line in GFA1.

    Returns:
      list : the alignment type, the name and orientation of the two
        oriented segments (``sid1``/``sid2``, ordered according to
        ``_is_sid1_from()``), the position (only if the alignment type is
        "C"), the overlap, an ID tag carrying the edge ID (unless it is a
        placeholder) and the tags of the line.

    Raises:
      gfapy.RuntimeError : if the alignment type is "I", or if the overlap
        does not validate with version "gfa1".
    """
    at = self._alignment_type
    if at == "I":
        raise gfapy.RuntimeError(
            "Conversion of edge line from GFA2 to GFA1 failed\n" +
            "Edge represents an internal overlap:\n" +
            "Edge line: {}\n".format(str(self)))
    a = [at]
    if self._is_sid1_from():
        ol1 = self.get("sid1")
        ol2 = self.get("sid2")
    else:
        ol1 = self.get("sid2")
        ol2 = self.get("sid1")
    a.append(ol1.name)
    a.append(ol1.orient)
    a.append(ol2.name)
    a.append(ol2.orient)
    if at == "C":
        a.append(str(self.pos))
    try:
        self.overlap.validate(version="gfa1")
    except Exception as err:
        # Only ordinary errors are translated; KeyboardInterrupt and
        # SystemExit propagate untouched. The original error is kept as
        # the cause for easier debugging.
        raise gfapy.RuntimeError(
            "Conversion of edge line from GFA2 to GFA1 failed\n" +
            "Overlap is invalid or not compatible with GFA1\n" +
            "Edge line: {}\n".format(str(self))) from err
    a.append(str(self.overlap))
    if not gfapy.is_placeholder(self.eid):
        a.append(gfapy.Field._to_gfa_tag(self.eid, "ID", datatype="Z"))
    for fn in self.tagnames:
        a.append(self.field_to_s(fn, tag=True))
    return a
def rc(sequence, valid=False, rna=False):
    """Compute the reverse complement of a nucleotidic sequence.

    Each character is replaced by its entry in the ``WCC`` table (defined
    outside this function) and the result is reversed. A placeholder
    sequence is returned unchanged.

    NOTE(review): the table is expected to cover the IUPAC extended
    alphabet (ACGTUBVHDRYKMSWN), the characters ".-=", spaces and newlines,
    preserving the case; confirm against the definition of ``WCC``. A falsy
    table entry is handled like a missing one.

    Returns
      str : the reverse complement; the sequence itself if it is a
            placeholder

    Parameters:
      sequence (str) : the sequence to reverse-complement
      valid (bool) : if True, a character which is not in the table
                     is its own complement
      rna (bool) : if True, t/T are substituted by u/U in the output

    Raises:
      gfapy.error.ValueError : if no complement is found for a character
                     (e.g. valid is False and the character is not in
                     the table)
    """
    if gfapy.is_placeholder(sequence):
        return sequence

    def complement(base):
        # Unknown characters fall back to themselves only if ``valid``.
        fallback = base if valid else None
        paired = WCC.get(base, fallback)
        if not paired:
            raise gfapy.ValueError(
                "{}: no Watson-Crick complement for {}".format(sequence, base))
        return paired

    # Complement left to right first (so that the first offending
    # character is the one reported), then reverse.
    complemented = [complement(base) for base in sequence]
    complemented.reverse()
    retval = "".join(complemented)
    if rna:
        retval = retval.translate(str.maketrans("tT", "uU"))
    return retval
def select(self, dict_or_line):
    """Select all lines which respect a criterion.

    The criterion is expressed by the argument, which is either a line
    instance or a dictionary.

    If it is a dictionary, it shall contain pairs of fieldnames/values and
    the method returns all lines where the mentioned fieldnames have the
    corresponding values.

    If it is a line, it is compared with the lines of the same type in the
    Gfa instance and lines with the same field values are returned
    (undefined placeholder values are thereby not compared).

    If the criterion specifies a name (which is not a placeholder), only the
    line with that name is considered.

    Parameters:
      dict_or_line (dict, gfapy.Line) : the criterion

    Returns:
      list : the lines matching the criterion; an empty list if a name is
        specified and no line has that name
    """
    is_dict = isinstance(dict_or_line, dict)
    name = dict_or_line.get("name", None) if is_dict \
        else dict_or_line.get("name")
    if name is not None and not gfapy.is_placeholder(name):
        named_line = self.__line_by_name(name)
        if named_line is None:
            # No line has this name, so nothing can match. Without this
            # guard the comparison below would be attempted on None.
            return []
        collection = [named_line]
    else:
        if is_dict:
            record_type = dict_or_line.get("record_type", None)
        else:
            record_type = dict_or_line.record_type
        collection = self.__collection_for_select(record_type)
    method = "_has_field_values" if is_dict else "_has_eql_fields"
    # A fresh ignore list is passed for each line, as the comparison
    # methods may modify it.
    return [line for line in collection
            if getattr(line, method)(dict_or_line, ["record_type", "name"])]
def _register_line(self, gfa_line):
    """Store ``gfa_line`` in ``self._records``.

    Where the line is stored depends on the ``STORAGE_KEY`` of its class:

    - "merge": the line is passed to the ``_merge`` method of the existing
      record stored under its record type;
    - "name": the line is stored under its name, or under ``id(gfa_line)``
      if the name is a placeholder; if the name consists of digits only,
      ``self._max_int_name`` is raised to its integer value when larger;
    - "external": the line is stored under ``gfa_line.external.line`` and
      then under ``id(gfa_line)``;
    - None: the line is stored under ``id(gfa_line)``.

    Nothing is stored for any other value of ``STORAGE_KEY``.
    """
    self._api_private_check_gfa_line(gfa_line, "_register_line")
    storage_key = gfa_line.__class__.STORAGE_KEY
    records = self._records
    rt = gfa_line.record_type

    if storage_key == "merge":
        records[rt]._merge(gfa_line)
        return

    if storage_key == "name":
        by_name = records.setdefault(rt, {})
        key = gfa_line.name
        if gfapy.is_placeholder(key):
            key = id(gfa_line)
        elif key.isdigit():
            # Keep track of the largest integer used as a name.
            keynum = int(key)
            if keynum > self._max_int_name:
                self._max_int_name = keynum
        by_name[key] = gfa_line
        return

    if storage_key == "external":
        # The entry for the record type itself must already exist here.
        by_external = records[rt]
        by_external.setdefault(gfa_line.external.line, {})[id(gfa_line)] = \
            gfa_line
        return

    if storage_key is None:
        records.setdefault(rt, {})[id(gfa_line)] = gfa_line
def try_get_line(self, l):
    """Call line() and raise an exception if the line is not found.

    Parameters:
      l : the argument passed on to line()

    Returns:
      the value returned by line(), if it is not None

    Raises:
      gfapy.ValueError : if no line is found and ``l`` is a placeholder
      gfapy.NotFoundError : if no line is found otherwise
    """
    gfa_line = self.line(l)
    if gfa_line is not None:
        return gfa_line
    if gfapy.is_placeholder(l):
        raise gfapy.ValueError(
            "'*' is a placeholder and not a valid name for a line")
    raise gfapy.NotFoundError("No line found with ID {}".format(l))
def _undef_overlaps(self):
    """
    Are the overlaps a single "*"?

    This is a compact representation of a linear path where all
    CIGARs are "*".

    Returns
    -------
    bool
      False unless ``self.overlaps`` has exactly one element; otherwise
      the result of ``gfapy.is_placeholder`` on that element.
    """
    overlaps = self.overlaps
    if len(overlaps) != 1:
        return False
    return gfapy.is_placeholder(overlaps[0])
def parse_gfa(filename):
    """Returns a nx.DiGraph representation of a GFA1 or GFA2 file.

    Every segment contributes two nodes (its own name and the negated
    name), each carrying the attributes ``length`` and ``gc_content``
    (None if the sequence is a placeholder). Every edge contributes the
    edge in its stated orientation and, unless identical to it, the
    complementary edge.

    NOTE that, at present, we only visualize nodes and edges in the GFA
    graph. A TODO is displaying all or most of the relevant information
    in these graphs, like GfaViz does: see
    https://github.com/marbl/MetagenomeScope/issues/147 for discussion
    of this.

    Raises ValueError if a segment has no length or if its name starts
    with "-".
    """
    digraph = nx.DiGraph()
    gfa_graph = gfapy.Gfa.from_file(filename)

    # Nodes ("segments")
    for node in gfa_graph.segments:
        if node.length is None:
            raise ValueError("Found a node without a specified length: "
                             "{}".format(node.name))
        if node.name[0] == "-":
            raise ValueError("Node IDs in the input assembly graph cannot "
                             'start with the "-" character.')
        if gfapy.is_placeholder(node.sequence):
            sequence_gc = None
        else:
            sequence_gc = gc_content(node.sequence)[0]
        # One node per orientation of the segment.
        for name in (node.name, negate_node_id(node.name)):
            digraph.add_node(name, length=node.length,
                             gc_content=sequence_gc)

    # Edges
    for edge in gfa_graph.edges:
        # Endpoints in the orientation stated explicitly by the edge.
        src_id = (negate_node_id(edge.from_name)
                  if edge.from_orient == "-" else edge.from_name)
        tgt_id = (negate_node_id(edge.to_name)
                  if edge.to_orient == "-" else edge.to_name)
        edge_tuple = (src_id, tgt_id)
        digraph.add_edge(*edge_tuple)

        # The complement is computed manually, since .complement() isn't
        # available for GFA2 edges as of writing. It is not added again
        # when the edge is its own complement (as in the loop.gfa test
        # case).
        complement_tuple = (negate_node_id(tgt_id), negate_node_id(src_id))
        if complement_tuple != edge_tuple:
            digraph.add_edge(*complement_tuple)
    return digraph
def validate_length(self):
    """
    Check that the LN tag agrees with the length of the sequence.

    Nothing is checked if the sequence is a placeholder or if the line
    has no LN tag.

    Raises
    ------
    gfapy.InconsistencyError
      If sequence length and LN tag are not consistent.
    """
    if gfapy.is_placeholder(self.sequence):
        return
    if "LN" not in self.tagnames:
        return
    seqlen = len(self.sequence)
    if self.LN != seqlen:
        raise gfapy.InconsistencyError(
            "Segment: {}\n".format(str(self)) +
            "Length in LN tag ({}) ".format(self.LN) +
            "is different from length of sequence field ({})".format(seqlen))
def validate_positions(self):
    """Checks that positions suffixed by $ are the last position of segments.

    The fields ``s_beg`` and ``s_end`` are compared with the length of the
    sequence of the segment in field ``sid``. Nothing is checked if the
    line is not connected or if that sequence is a placeholder.

    Raises gfapy.InconsistencyError if a position flagged by
    ``gfapy.islastpos`` differs from the sequence length.
    """
    if not self.is_connected():
        return
    seg = self.get("sid")
    seq = seg.sequence
    if gfapy.is_placeholder(seq):
        return
    seqlen = len(seq)
    field_names = ["s_" + sfx for sfx in ["beg", "end"]]
    for fn in field_names:
        pos = self.get(fn)
        if gfapy.islastpos(pos) and pos != seqlen:
            raise gfapy.InconsistencyError(
                "Fragment: {}\n".format(str(self)) +
                "Field {}: $ after ".format(str(fn)) +
                "non-last position ({})\n".format(str(pos)) +
                "Segment: {}".format(str(seg)))
def __new__(cls, *args, **kargs):
    """Create an instance of an alignment field class.

    Exactly one positional argument is expected. Depending on it:

    - None or a placeholder: a ``gfapy.AlignmentPlaceholder`` is returned
      (this check comes before the check for extra arguments);
    - a ``gfapy.CIGAR`` or ``gfapy.Trace``: the argument itself is
      returned;
    - a string: the result of ``Alignment._from_string``;
    - a list: the result of ``Alignment._from_list``.

    Keyword arguments are passed on to ``_from_string``/``_from_list``.

    Raises:
      gfapy.ArgumentError : if no positional argument is given, if more
        than one is given (and the first is neither None nor a
        placeholder), or if the argument has an unsupported type.
    """
    def wrong_argument_count():
        return gfapy.ArgumentError("The Alignment() constructor requires "+
            "a single positional argument, {} found".format(len(args)))

    if not args:
        # Without this guard, args[0] below would fail with an IndexError.
        raise wrong_argument_count()
    value = args[0]
    if value is None or gfapy.is_placeholder(value):
        return gfapy.AlignmentPlaceholder()
    if len(args) > 1:
        raise wrong_argument_count()
    # Alignment instances are tested before the generic str/list cases.
    if isinstance(value, (gfapy.CIGAR, gfapy.Trace)):
        return value
    if isinstance(value, str):
        return Alignment._from_string(*args, **kargs)
    elif isinstance(value, list):
        return Alignment._from_list(*args, **kargs)
    else:
        raise gfapy.ArgumentError("Cannot create an alignment "+
            "from an instance of the class {}".format(type(value)))
def line(self, l):
    """Search a line in a GFA.

    If the argument is a line, it is returned. If it is a string, it is
    used as a line identifier, and the line with that identifier is
    returned. If no line has the identifier, None is returned. None is
    also returned for a placeholder and for any other kind of argument.

    Parameters:
      l (str, gfapy.Line)
    """
    # The placeholder test comes first, before the type is looked at.
    if gfapy.is_placeholder(l):
        return None
    if isinstance(l, gfapy.Line):
        return l
    if isinstance(l, str):
        return self.__line_by_name(l)
    return None
def _has_field_values(self, hsh, ignore_fields = None):
    """Tell whether the fields of the line have the values given in ``hsh``.

    Parameters:
      hsh (dict) : pairs of fieldnames and expected values
      ignore_fields (list) : names of fields of ``hsh`` which shall not be
        compared; the list is not modified

    Returns:
      bool : False if "record_type" is in ``hsh`` (and not ignored) and
        differs from the record type of the line, if a compared field is
        None in the line, or if both the value and the string
        representation of a compared field differ from the expected value;
        True otherwise. Fields whose value in the line is a placeholder
        are skipped.
    """
    assert(isinstance(hsh, dict))
    # Work on a private copy: "record_type" is added below and the list
    # of the caller must not be altered by that.
    ignored = [] if ignore_fields is None else list(ignore_fields)
    if ("record_type" in hsh) and ("record_type" not in ignored) \
            and (self.record_type != hsh["record_type"]):
        return False
    # The record type has been dealt with above; it is not a field to be
    # fetched using get().
    ignored.append("record_type")
    for fieldname, expected in hsh.items():
        if fieldname in ignored:
            continue
        value = self.get(fieldname)
        if value is None:
            return False
        if gfapy.is_placeholder(value):
            continue
        # The expected value may be given either as a value or as its
        # string representation.
        if value != expected and self.field_to_s(fieldname) != expected:
            return False
    return True
def _to_gfa1_a(self):
    """List of the field content of the path line in GFA1.

    Returns:
      list : "P", the path name, the comma-separated string
        representations of the captured segments and the comma-separated
        overlaps of the lines of the captured edges.

    Raises:
      gfapy.ValueError : if the path name is a placeholder

    Each segment name is checked with ``gfapy.Field._validate_gfa_field``
    as "segment_name_gfa1" and each overlap as "alignment_gfa1"; whatever
    that check raises is propagated.
    """
    if gfapy.is_placeholder(self.name):
        raise gfapy.ValueError("Conversion to GFA1 failed\n" +
                               "The path name is a placeholder\t" +
                               "Line: {}".format(self))

    segment_names = []
    for oriented_segment in self.captured_segments:
        gfapy.Field._validate_gfa_field(oriented_segment.name,
                                        "segment_name_gfa1")
        segment_names.append(str(oriented_segment))

    overlaps = []
    for oriented_edge in self.captured_edges:
        overlap = oriented_edge.line.overlap
        gfapy.Field._validate_gfa_field(overlap, "alignment_gfa1")
        overlaps.append(str(overlap))

    return ["P", self.name, ",".join(segment_names), ",".join(overlaps)]
def length(self):
    """
    Length of the segment.

    Returns
    -------
    int
      Value of LN tag, if segment has LN tag (including an LN of 0).
    int
      Sequence length if no LN and sequence not "*".
    None
      If no LN and sequence is "*".

    See Also
    --------
    try_get_length
    """
    ln = self.LN
    # Test against None explicitly: an LN tag of 0 is a length too and
    # must not be handled as a missing tag.
    if ln is not None:
        return ln
    if not gfapy.is_placeholder(self.sequence):
        return len(self.sequence)
    return None
def _unregister_line(self, gfa_line):
    """Remove ``gfa_line`` from ``self._records``.

    The key to remove depends on the ``STORAGE_KEY`` of the class of the
    line:

    - "name": the name of the line, or ``id(gfa_line)`` if the name is a
      placeholder;
    - "external": ``id(gfa_line)`` in the sub-collection stored under
      ``gfa_line.external.name``; the sub-collection is removed too once
      it is empty;
    - anything else: ``id(gfa_line)``.

    Raises gfapy.AssertionError if the record type of the line is "H".
    """
    self._api_private_check_gfa_line(gfa_line, "unregister_line")
    rt = gfa_line.record_type
    if rt == "H":
        raise gfapy.AssertionError("Bug found, please report\n" +
                                   "gfa_line: {}".format(gfa_line))
    collection = self._records[rt]
    storage_key = gfa_line.__class__.STORAGE_KEY
    if storage_key == "name":
        name = gfa_line.name
        if gfapy.is_placeholder(name):
            name = id(gfa_line)
        collection.pop(name)
    elif storage_key == "external":
        # NOTE(review): _register_line stores these lines under
        # external.line, while external.name is used here; confirm that
        # the two always denote the same key.
        subkey = gfa_line.external.name
        by_id = collection[subkey]
        by_id.pop(id(gfa_line))
        if not by_id:
            self._records[rt].pop(subkey)
    else:
        collection.pop(id(gfa_line))
def __eq__(self, other):
    """Compare equal to anything that gfapy.is_placeholder accepts."""
    other_is_placeholder = gfapy.is_placeholder(other)
    return other_is_placeholder
def create_merged_segment(gfa, segpath, merged_name=None,
                          enable_tracking=False, cut_counts=False):
    """Create the segment resulting from merging the segments of a path.

    The merged segment starts as a clone of the first segment of the path;
    the other segments are added one by one, each time cutting from the
    added sequence the part which overlaps the previous segment.

    Parameters:
      gfa : the Gfa instance containing the segments
      segpath (list) : the segment ends along the path; consecutive
        elements must be connected by exactly one "dovetails" relation
      merged_name (str) : name of the merged segment; "short" selects
        ``gfa.unused_name()``; if not set, the name is obtained by joining
        the names collected from the segments using "_"
      enable_tracking (bool) : passed on to ``gfa._add_segment_to_merged``
      cut_counts (bool) : if True, the summed count tags are scaled by
        length / (length + total cut), where length is the length of the
        merged segment

    Returns:
      tuple : the merged segment, whether the first segment was reversed,
        and whether the last added segment was reversed (None if the path
        has a single element)

    Raises:
      gfapy.ValueError : if two consecutive segment ends are not connected
        by exactly one "dovetails" relation
      gfapy.InconsistencyError : if ``gfa._vlevel`` > 0 and the computed
        LN differs from the length of the computed sequence
    """
    merged = gfa.try_get_segment(segpath[0].segment).clone()
    # vlevel is set to 0 while the segment is put together and restored
    # afterwards (presumably because name and sequence are temporarily
    # lists - TODO confirm).
    merged_vlevel = merged.vlevel
    merged.vlevel = 0
    total_cut = 0
    a = segpath[0]
    first_reversed = (a.end_type == "L")
    last_reversed = None
    if merged_name == "short":
        merged_name = gfa.unused_name()
    gfa._add_segment_to_merged(merged, gfa.segment(a.segment),
                               first_reversed, 0, True,
                               enable_tracking=enable_tracking,
                               merged_name=merged_name)
    for s in segpath[1:]:
        b = gfapy.SegmentEnd(s).inverted()
        ls = gfa.segment(a.segment).end_relations(a.end_type, b, "dovetails")
        if len(ls) != 1:
            msg = "A single link was expected between {} ".format(a) + \
                  "and {}, ".format(b) + "{} were found".format(len(ls))
            raise gfapy.ValueError(msg)
        l = ls[0]
        if not l.overlap:
            cut = 0
        else:
            # The cut is the length of the overlap on the query, but not
            # more than the LN of the added segment. The operations in the
            # overlap are not restricted to M/=.
            cut = min(l.overlap.length_on_query(),
                      gfa.segment(b.segment).LN)
        total_cut += cut
        last_reversed = (b.end_type == "R")
        gfa._add_segment_to_merged(merged, gfa.segment(b.segment),
                                   last_reversed, cut, False,
                                   enable_tracking=enable_tracking,
                                   merged_name=merged_name)
        a = gfapy.SegmentEnd(b).inverted()
    merged.vlevel = merged_vlevel
    # Turn the lists collected while merging into their final form.
    if isinstance(merged.name, list):
        merged.name = "_".join(merged.name)
    ortag = merged.get("or")
    if isinstance(ortag, list):
        merged.set("or", ",".join(ortag))
    if not gfapy.is_placeholder(merged.sequence):
        merged.sequence = "".join(merged.sequence)
        if not merged.LN:
            merged.LN = len(merged.sequence)
        elif gfa._vlevel > 0 and merged.LN != len(merged.sequence):
            raise gfapy.InconsistencyError(
                "Computed sequence length {} ".format(len(merged.sequence)) +
                "and computed LN {} differ".format(merged.LN))
    if merged.length is None:
        # Without a length the counts are not computed and the count tags
        # inherited from the first segment are removed.
        for count_tag in ["KC", "RC", "FC"]:
            merged.set(count_tag, None)
    else:
        factor = 1
        if cut_counts:
            factor = merged.length / (total_cut + merged.length)
        for count_tag, count in gfa.__sum_of_counts(segpath, factor).items():
            merged.set(count_tag, count)
    return merged, first_reversed, last_reversed
def test_line_placeholder(self):
    """The line fixtures a and b are not placeholders."""
    fixtures = TestUnitLineEquivalence
    for attribute in ("a", "b"):
        line = getattr(fixtures, attribute)
        assert (not gfapy.is_placeholder(line))
def _add_segment_to_merged(self, merged, segment, is_reversed, cut, init,
                           enable_tracking=False, merged_name=None):
    """Add the content of ``segment`` to the segment ``merged``.

    While merging, ``merged.sequence``, ``merged.name`` and the ``or`` tag
    of ``merged`` are lists, to which the parts coming from the single
    segments are appended.

    Parameters:
      merged : the segment under construction (modified in place)
      segment : the segment to add
      is_reversed (bool) : if True, the reverse complement of the sequence
        is added and, with tracking, name, ``or`` tag and the position
        arrays ``rn`` and ``mp`` are reversed
      cut (int) : number of leading characters of the (possibly reverse
        complemented) sequence which are not added
      init (bool) : True for the first segment, which initializes
        sequence, name and LN of ``merged``
      enable_tracking (bool) : if True, the tags ``rn``, ``mp`` and ``or``
        of ``merged`` are kept up to date
      merged_name (str) : if set, it is used as the only name of
        ``merged`` and the names of the segments are not collected
    """
    seg_name = segment.name
    if is_reversed:
        seq_part = gfapy.sequence.rc(segment.sequence)[cut:]
        if enable_tracking:
            seg_name = self._reverse_segment_name(segment.name, "_")
            rn = self._reverse_pos_array(segment.rn, segment.LN)
            mp = self._reverse_pos_array(segment.mp, segment.LN)
    else:
        seq_part = segment.sequence[cut:]
        if enable_tracking:
            rn = segment.rn
            mp = segment.mp

    if enable_tracking:
        if not mp and segment.LN:
            mp = [1, segment.LN]
        # Origin: the ``or`` tag of the segment if any (reversed when
        # needed), otherwise the (possibly reversed) segment name.
        origin = segment.get("or")
        if origin is None:
            origin = seg_name
        elif is_reversed:
            origin = self._reverse_segment_name(origin, ",")

    if init:
        merged.sequence = [seq_part]
        merged.name = [merged_name] if merged_name else [seg_name]
        merged.LN = segment.LN
        if enable_tracking:
            merged.rn = rn
            merged.set("or", [origin])
            merged.mp = mp
        return

    if gfapy.is_placeholder(segment.sequence):
        merged.sequence = gfapy.Placeholder()
    else:
        merged.sequence.append(seq_part)
    if not merged_name:
        merged.name.append(seg_name)

    if merged.LN:
        if enable_tracking:
            # Positions are shifted by the length merged so far, minus
            # the part cut from this segment.
            if rn:
                rn = [pos - cut + merged.LN for pos in rn]
                if not merged.rn:
                    merged.rn = rn
                else:
                    merged.rn += rn
            if mp and merged.mp:
                merged.mp += [pos - cut + merged.LN for pos in mp]
        if segment.LN:
            merged.LN += (segment.LN - cut)
        else:
            merged.LN = None
    elif enable_tracking:
        merged.mp = None

    if enable_tracking:
        if not merged.get("or"):
            merged.set("or", [origin])
        else:
            merged.get("or").append(origin)