Beispiel #1
0
 def _has_eql_fields(self, refline, ignore_fields = None):
   """Tell whether this line agrees with ``refline`` on the compared fields.

   The fields compared are the positional fields and tags of ``refline``,
   minus those listed in ``ignore_fields``. A field whose value is a
   placeholder, on either side, is not compared.

   Parameters:
     refline (gfapy.Line) : the reference line
     ignore_fields (list) : fieldnames to skip; ``"record_type"`` skips the
       record type check, ``"name"`` additionally skips the field named by
       the ``NAME_FIELD`` of the class of ``refline``

   Returns:
     bool : False if the record types differ (unless ignored), if a compared
       field is missing (None) in this line, or if two non-placeholder
       values differ; True otherwise
   """
   assert(isinstance(refline, gfapy.Line))
   skipped = [] if ignore_fields is None else ignore_fields
   # NOTE(review): presumably rewrites aliases inside the list in place,
   # since the return value is not used -- confirm.
   self._dealias_fieldnames(skipped)
   if "record_type" not in skipped and \
       self.record_type != refline.record_type:
     return False
   candidates = refline.positional_fieldnames + refline.tagnames
   compared = [fn for fn in candidates if fn not in skipped]
   if "name" in skipped:
     name_field = refline.__class__.NAME_FIELD
     if name_field in compared:
       compared.remove(name_field)
   for fn in compared:
     expected = refline.get(fn)
     if gfapy.is_placeholder(expected):
       continue
     actual = self.get(fn)
     if actual is None:
       return False
     if not gfapy.is_placeholder(actual) and actual != expected:
       return False
   return True
Beispiel #2
0
 def test_is_placeholder(self):
     """Check gfapy.is_placeholder() on the alignment fixtures of the class.

     The empty CIGAR/trace and the placeholder fixtures must be recognized
     as placeholders; the non-empty CIGAR/trace fixtures must not.
     """
     expected_placeholders = (
         TestApiAlignment.cigar_empty,
         TestApiAlignment.trace_empty,
         TestApiAlignment.placeholder,
         TestApiAlignment.placeholder_s,
     )
     for fixture in expected_placeholders:
         assert (gfapy.is_placeholder(fixture))
     expected_values = (
         TestApiAlignment.cigar_1,
         TestApiAlignment.cigar_1_s,
         TestApiAlignment.trace_1,
         TestApiAlignment.trace_1_s,
     )
     for fixture in expected_values:
         assert (not gfapy.is_placeholder(fixture))
Beispiel #3
0
 def test_extensions(self):
     """Exercise the custom record types M (MetagenomicAssignment) and
     T (Taxon) inside a GFA2 graph.

     The lines are appended in an order where some references point to
     lines that are only added later: m2 and mx name segment B before sB is
     appended, and m1/m2 name taxon:123 before t123 is appended. Keep this
     order when editing the test.
     """
     g = gfapy.Gfa(version="gfa2", vlevel=0)
     # The instance is built from a list and discarded.
     # NOTE(review): presumably this only checks that list construction does
     # not raise -- confirm.
     MetagenomicAssignment(["M", "*", "N12", "C", "SC:i:20"])
     sA = gfapy.Line("S\tA\t1000\t*")
     g.append(sA)
     tB12 = gfapy.Line("T\tB12_c")
     g.append(tB12)
     m1 = gfapy.Line("M\t1\ttaxon:123\tA\tSC:i:40\txx:Z:cjaks536")
     g.append(m1)
     # m2 refers to segment B, which is appended only afterwards.
     m2 = gfapy.Line("M\t2\ttaxon:123\tB\txx:Z:cga5r5cs")
     g.append(m2)
     sB = gfapy.Line("S\tB\t1000\t*")
     g.append(sB)
     # mx has a placeholder ("*") as its identifier.
     mx = gfapy.Line("M\t*\tB12_c\tB\tSC:i:20")
     g.append(mx)
     # taxon:123 is appended after the M lines that refer to it.
     t123 = gfapy.Line("T\ttaxon:123\tUL:Z:http://www.taxon123.com")
     g.append(t123)
     # gfapy.Line() is expected to build instances of the extension classes.
     self.assertEqual(MetagenomicAssignment, m1.__class__)
     self.assertEqual(Taxon, tB12.__class__)
     self.assertEqual("1", m1.mid)
     assert (gfapy.is_placeholder(mx.mid))
     # The tid/sid fields of m1 are expected to compare equal to the lines.
     self.assertEqual(t123, m1.tid)
     self.assertEqual(sA, m1.sid)
     self.assertEqual("cjaks536", m1.xx)
     # Back-references from the segment and the taxon to their M lines.
     self.assertEqual([m2, mx], sB.metagenomic_assignments)
     self.assertEqual([m1, m2], t123.metagenomic_assignments)
     self.assertEqual("taxon:123", t123.tid)
     self.assertEqual("http://www.taxon123.com", t123.UL)
Beispiel #4
0
 def _to_gfa1_a(self):
     """List of the field content of the line in GFA1.

     The list starts with the alignment type of the edge, followed by the
     name and orientation of the two segment references (sid1/sid2, ordered
     according to ``_is_sid1_from()``), the position (only for alignment
     type "C"), the overlap, an ``ID`` tag holding the edge identifier
     (unless it is a placeholder) and the tags of the line.

     Returns:
       list : the GFA1 fields of the line

     Raises:
       gfapy.RuntimeError : if the alignment type is "I" (internal
         overlap), or if validating the overlap as GFA1 fails
     """
     at = self._alignment_type
     if at == "I":
         raise gfapy.RuntimeError(
             "Conversion of edge line from GFA2 to GFA1 failed\n" +
             "Edge represents an internal overlap:\n" +
             "Edge line: {}\n".format(str(self)))
     a = [at]
     if self._is_sid1_from():
         ol1 = self.get("sid1")
         ol2 = self.get("sid2")
     else:
         ol1 = self.get("sid2")
         ol2 = self.get("sid1")
     a.append(ol1.name)
     a.append(ol1.orient)
     a.append(ol2.name)
     a.append(ol2.orient)
     if at == "C":
         a.append(str(self.pos))
     try:
         self.overlap.validate(version="gfa1")
     except Exception as err:
         # Only ordinary errors are converted; KeyboardInterrupt and
         # SystemExit propagate unchanged. The original error is chained
         # so that the reason for the validation failure is not lost.
         raise gfapy.RuntimeError(
             "Conversion of edge line from GFA2 to GFA1 failed\n" +
             "Overlap is invalid or not compatible with GFA1\n" +
             "Edge line: {}\n".format(str(self))) from err
     a.append(str(self.overlap))
     if not gfapy.is_placeholder(self.eid):
         a.append(gfapy.Field._to_gfa_tag(self.eid, "ID", datatype="Z"))
     for fn in self.tagnames:
         a.append(self.field_to_s(fn, tag=True))
     return a
Beispiel #5
0
def rc(sequence, valid=False, rna=False):
    """Compute the reverse complement of a nucleotidic sequence.

    The complement of each character is looked up in the ``WCC`` table.
    The case of each character is preserved.

    Returns:
      str : reverse complement; the input itself if it is a placeholder
        ("*")

    Parameters:
      sequence (str) : the sequence to reverse-complement
      valid (bool) : if True, the reverse complement of any character
        missing from the table is the character itself
      rna (bool) : if True, t/T are substituted by u/U in the output

    Raises:
      gfapy.error.ValueError : if valid is False and a character with no
        entry in the complement table is found
    """
    if gfapy.is_placeholder(sequence): return sequence

    def complement(c):
        wcc = WCC.get(c, c if valid else None)
        # Only a missing entry is an error. An empty-string entry is a valid
        # complement meaning "drop this character"; testing truthiness here
        # would wrongly reject it.
        # NOTE(review): presumably spaces and newlines map to "" in WCC --
        # confirm against the table.
        if wcc is None:
            raise gfapy.ValueError(
                "{}: no Watson-Crick complement for {}".format(sequence, c))
        return wcc

    # Complement in forward order (so the first invalid character is the one
    # reported), then reverse.
    retval = "".join(reversed([complement(c) for c in sequence]))
    if rna:
        retval = retval.translate(str.maketrans("tT", "uU"))
    return retval
Beispiel #6
0
    def select(self, dict_or_line):
        """Select all lines which respect a criterion.

        The criterion is expressed by the argument, which is either a line
        instance or a dictionary. If it is a dictionary, it shall contain
        pairs of fieldnames/values and the method returns all lines
        where the mentioned fieldnames have the corresponding values.
        If it is a line, it is compared with the lines of the same type
        in the Gfa instance and lines with the same field values
        are returned (undefined placeholder values are thereby not compared).

        Parameters:
          dict_or_line (dict, gfapy.Line) : the selection criterion

        Returns:
          list : the matching lines; empty if the criterion contains a name
            and no line with that name is found
        """
        is_dict = isinstance(dict_or_line, dict)
        if is_dict:
            name = dict_or_line.get("name", None)
        else:
            name = dict_or_line.get("name")
        if name is not None and not gfapy.is_placeholder(name):
            # A name identifies at most one candidate. The lookup yields None
            # when nothing has that name; return no match instead of calling
            # a method on None.
            named_line = self.__line_by_name(name)
            if named_line is None:
                return []
            collection = [named_line]
        else:
            if is_dict:
                record_type = dict_or_line.get("record_type", None)
            else:
                record_type = dict_or_line.record_type
            collection = self.__collection_for_select(record_type)
        method = "_has_field_values" if is_dict else "_has_eql_fields"
        # A fresh ignore list is passed on every call, as the comparison
        # methods may modify the list they receive.
        return [line for line in collection \
            if getattr(line, method)(dict_or_line, ["record_type","name"])]
Beispiel #7
0
 def _register_line(self, gfa_line):
     """Store ``gfa_line`` in ``self._records`` under its record type.

     How the line is stored depends on the ``STORAGE_KEY`` of its class:

     - ``"merge"``: the line is passed to ``_merge()`` of the existing entry
     - ``"name"``: stored by name; ``id()`` of the line is used if the name
       is a placeholder; all-digit names update ``self._max_int_name``
     - ``"external"``: stored by ``id()`` in a sub-collection keyed by
       ``gfa_line.external.line``
     - ``None``: stored by ``id()`` of the line
     """
     self._api_private_check_gfa_line(gfa_line, "_register_line")
     key_kind = gfa_line.__class__.STORAGE_KEY
     rt = gfa_line.record_type
     if key_kind == "merge":
         self._records[rt]._merge(gfa_line)
         return
     if key_kind == "name":
         if rt not in self._records:
             self._records[rt] = {}
         line_key = gfa_line.name
         if gfapy.is_placeholder(line_key):
             # unnamed lines are stored under their object identity
             line_key = id(gfa_line)
         elif line_key.isdigit():
             as_number = int(line_key)
             if as_number > self._max_int_name:
                 self._max_int_name = as_number
         self._records[rt][line_key] = gfa_line
         return
     if key_kind == "external":
         owner = gfa_line.external.line
         if owner not in self._records[rt]:
             self._records[rt][owner] = {}
         self._records[rt][owner][id(gfa_line)] = gfa_line
         return
     if key_kind is None:
         if rt not in self._records:
             self._records[rt] = {}
         self._records[rt][id(gfa_line)] = gfa_line
Beispiel #8
0
 def try_get_line(self, l):
     """Call line() and raise an exception if the line is not found.

     Raises:
       gfapy.ValueError : if nothing is found and ``l`` is a placeholder
       gfapy.NotFoundError : if nothing is found for any other ``l``
     """
     found = self.line(l)
     if found is not None:
         return found
     if gfapy.is_placeholder(l):
         raise gfapy.ValueError(
             "'*' is a placeholder and not a valid name for a line")
     raise gfapy.NotFoundError("No line found with ID {}".format(l))
Beispiel #9
0
    def _undef_overlaps(self):
        """
        Tell whether the overlaps consist of a single placeholder ("*").

        This is a compact representation of a linear path where all CIGARs
        are "*".

        Returns
        -------
        bool
        """
        if len(self.overlaps) != 1:
            return False
        only_overlap = self.overlaps[0]
        return gfapy.is_placeholder(only_overlap)
Beispiel #10
0
def parse_gfa(filename):
    """Build a nx.DiGraph from a GFA file read with gfapy.

    Only the segments (as nodes) and the edges of the GFA graph are
    represented. Every segment yields two nodes, one per orientation, and
    every edge is added together with its complement.

    NOTE: displaying all or most of the other information stored in the GFA
    file (as GfaViz does) is a TODO: see
    https://github.com/marbl/MetagenomeScope/issues/147 for discussion.

    Raises ValueError if a segment has no length, or if a segment name
    starts with "-".
    """
    digraph = nx.DiGraph()
    gfa_graph = gfapy.Gfa.from_file(filename)

    # Segments: one node per orientation, both carrying the same attributes.
    for segment in gfa_graph.segments:
        seg_length = segment.length
        if seg_length is None:
            raise ValueError("Found a node without a specified length: "
                             "{}".format(segment.name))
        seg_name = segment.name
        if seg_name[0] == "-":
            raise ValueError("Node IDs in the input assembly graph cannot "
                             'start with the "-" character.')
        seg_sequence = segment.sequence
        if gfapy.is_placeholder(seg_sequence):
            gc = None
        else:
            gc = gc_content(seg_sequence)[0]
        for node_id in (seg_name, negate_node_id(seg_name)):
            digraph.add_node(node_id, length=seg_length, gc_content=gc)

    # Edges: a "-" orientation selects the negated node ID of a segment.
    for edge in gfa_graph.edges:
        src_id = (negate_node_id(edge.from_name)
                  if edge.from_orient == "-" else edge.from_name)
        tgt_id = (negate_node_id(edge.to_name)
                  if edge.to_orient == "-" else edge.to_name)
        forward = (src_id, tgt_id)
        digraph.add_edge(*forward)

        # The complement is computed by hand (.complement() isn't available
        # for GFA2 edges as of writing). It is skipped when it coincides
        # with the edge itself, as in the loop.gfa test case.
        backward = (negate_node_id(tgt_id), negate_node_id(src_id))
        if backward != forward:
            digraph.add_edge(*backward)
    return digraph
Beispiel #11
0
 def validate_length(self):
     """
     Check that the LN tag agrees with the length of the sequence.

     Nothing is checked if the sequence is a placeholder or if the line
     has no LN tag.

     Raises
     ------
     gfapy.InconsistencyError
       If sequence length and LN tag are not consistent.
     """
     if gfapy.is_placeholder(self.sequence) or "LN" not in self.tagnames:
         return
     if self.LN == len(self.sequence):
         return
     msg = "Segment: {}\n".format(str(self))
     msg += "Length in LN tag ({}) ".format(self.LN)
     msg += "is different from length of sequence field ({})".format(
         len(self.sequence))
     raise gfapy.InconsistencyError(msg)
Beispiel #12
0
 def validate_positions(self):
     """Checks that positions suffixed by $ are the last position of segments.

     Nothing is checked if the line is not connected or if the sequence of
     the segment is a placeholder.

     Raises:
       gfapy.InconsistencyError : if s_beg or s_end is marked as a last
         position but differs from the length of the segment sequence
     """
     if not self.is_connected():
         return
     seg = self.get("sid")
     seq = seg.sequence
     if gfapy.is_placeholder(seq):
         return
     seqlen = len(seq)
     for fn in ("s_" + "beg", "s_" + "end"):
         pos = self.get(fn)
         if gfapy.islastpos(pos) and pos != seqlen:
             raise gfapy.InconsistencyError(
                 "Fragment: {}\n".format(str(self)) +
                 "Field {}: $ after ".format(str(fn)) +
                 "non-last position ({})\n".format(str(pos)) +
                 "Segment: {}".format(str(seg)))
Beispiel #13
0
 def __new__(cls, *args, **kargs):
   """Create an instance of an alignment field class.

   Exactly one positional argument is expected:

   - None or a placeholder: a gfapy.AlignmentPlaceholder is returned
   - a gfapy.CIGAR or gfapy.Trace instance: returned unchanged
   - a str: passed (with the keyword arguments) to Alignment._from_string
   - a list: passed (with the keyword arguments) to Alignment._from_list

   Raises:
     gfapy.ArgumentError : if no positional argument is given, if more
       than one is given (and the first is not None/placeholder), or if
       the argument is of an unsupported type
   """
   if not args:
     # Without this guard args[0] below fails with a bare IndexError.
     raise gfapy.ArgumentError("The Alignment() constructor requires "+
         "a single positional argument, {} found".format(len(args)))
   value = args[0]
   # The placeholder test comes before the argument count check, as in the
   # original code, so that existing calls keep their behavior.
   if value is None or gfapy.is_placeholder(value):
     return gfapy.AlignmentPlaceholder()
   if len(args) > 1:
     raise gfapy.ArgumentError("The Alignment() constructor requires "+
         "a single positional argument, {} found".format(len(args)))
   if isinstance(value, (gfapy.CIGAR, gfapy.Trace)):
     return value
   if isinstance(value, str):
     return Alignment._from_string(*args, **kargs)
   elif isinstance(value, list):
     return Alignment._from_list(*args, **kargs)
   else:
     raise gfapy.ArgumentError("Cannot create an alignment "+
         "from an instance of the class {}".format(type(value)))
Beispiel #14
0
    def line(self, l):
        """Search a line in a GFA.

        A line instance is returned as it is. A string is taken as a line
        identifier and the line with that identifier is returned (None if
        there is none). None is also returned for a placeholder and for any
        other kind of argument.

        Parameters:
          l (str, gfapy.Line)
        """
        if gfapy.is_placeholder(l):
            return None
        if isinstance(l, gfapy.Line):
            return l
        if isinstance(l, str):
            return self.__line_by_name(l)
        return None
Beispiel #15
0
 def _has_field_values(self, hsh, ignore_fields = None):
   """Tell whether the fields of this line have the values given in ``hsh``.

   A field of this line whose value is a placeholder is not compared. A
   value matches if it is equal either to the field value or to the string
   representation of the field (``field_to_s``).

   Parameters:
     hsh (dict) : fieldname/value pairs to compare with
     ignore_fields (list) : fieldnames of ``hsh`` to skip; the list is not
       modified

   Returns:
     bool : False if the record type differs (unless ignored), if a
       compared field is missing (None) in this line, or if a value does
       not match; True otherwise
   """
   assert(isinstance(hsh, dict))
   # Work on a copy: "record_type" is added below, and the caller's list
   # must not grow as a side effect of the comparison.
   ignored = [] if ignore_fields is None else list(ignore_fields)
   if ("record_type" in hsh) and ("record_type" not in ignored) \
       and (self.record_type != hsh["record_type"]):
     return False
   # record_type is handled above; it is never looked up as a field
   ignored.append("record_type")
   for fieldname, expected in hsh.items():
     if fieldname in ignored:
       continue
     value = self.get(fieldname)
     if value is None:
       return False
     if gfapy.is_placeholder(value):
       continue
     if value != expected and \
         (self.field_to_s(fieldname) != expected):
       return False
   return True
Beispiel #16
0
 def _to_gfa1_a(self):
     """List of the field content of the path line in GFA1.

     The list contains "P", the path name, the comma-separated captured
     segments and the comma-separated overlaps of the captured edges.
     Each segment name and each overlap is validated before it is used.

     Raises:
       gfapy.ValueError : if the path name is a placeholder
     """
     if gfapy.is_placeholder(self.name):
         # NOTE(review): this message uses "\t" before "Line:" where a
         # newline might be intended -- confirm.
         raise gfapy.ValueError("Conversion to GFA1 failed\n" +
                                "The path name is a placeholder\t" +
                                "Line: {}".format(self))
     fields = ["P", self.name]
     segment_names = []
     for oriented_segment in self.captured_segments:
         gfapy.Field._validate_gfa_field(oriented_segment.name,
                                         "segment_name_gfa1")
         segment_names.append(str(oriented_segment))
     fields.append(",".join(segment_names))
     overlaps = []
     for oriented_edge in self.captured_edges:
         edge_overlap = oriented_edge.line.overlap
         gfapy.Field._validate_gfa_field(edge_overlap, "alignment_gfa1")
         overlaps.append(str(edge_overlap))
     fields.append(",".join(overlaps))
     return fields
Beispiel #17
0
    def length(self):
        """
        Length of the segment.

        Returns
        -------
        int
          Value of LN tag, if segment has LN tag (including a value of 0).
        int
          Sequence length if no LN and sequence not "*".
        None
          If there is no LN and the sequence is "*".

        See Also
        --------
        try_get_length
        """
        ln = self.LN
        # Compare with None rather than testing truthiness: an LN of 0 is a
        # value, not a missing tag.
        if ln is not None:
            return ln
        if not gfapy.is_placeholder(self.sequence):
            return len(self.sequence)
        return None
Beispiel #18
0
 def _unregister_line(self, gfa_line):
     """Remove ``gfa_line`` from ``self._records``.

     The key used for the removal depends on the ``STORAGE_KEY`` of the
     class of the line: the name (or ``id()`` of the line if the name is a
     placeholder) for ``"name"``; ``id()`` of the line inside the
     sub-collection keyed by ``gfa_line.external.name`` for ``"external"``
     (an emptied sub-collection is removed too); ``id()`` of the line
     otherwise.

     Raises:
       gfapy.AssertionError : if the record type of the line is "H"
     """
     self._api_private_check_gfa_line(gfa_line, "unregister_line")
     rt = gfa_line.record_type
     if rt == "H":
         raise gfapy.AssertionError("Bug found, please report\n" +
                                    "gfa_line: {}".format(gfa_line))
     by_type = self._records[rt]
     key_kind = gfa_line.__class__.STORAGE_KEY
     if key_kind == "name":
         line_key = gfa_line.name
         if gfapy.is_placeholder(line_key):
             line_key = id(gfa_line)
         by_type.pop(line_key)
         return
     if key_kind == "external":
         # NOTE(review): the sub-collection is looked up by external.name
         # here -- confirm this matches the key used when registering.
         subkey = gfa_line.external.name
         bucket = by_type[subkey]
         bucket.pop(id(gfa_line))
         if not bucket:
             self._records[rt].pop(subkey)
         return
     by_type.pop(id(gfa_line))
Beispiel #19
0
 def __eq__(self, other):
     """Compare equal to anything gfapy.is_placeholder() accepts."""
     other_is_placeholder = gfapy.is_placeholder(other)
     return other_is_placeholder
Beispiel #20
0
def create_merged_segment(gfa,
                          segpath,
                          merged_name=None,
                          enable_tracking=False,
                          cut_counts=False):
    """Build the segment obtained by merging the segments of a path.

    A clone of the first segment of the path is extended, one segment at a
    time, with the following ones. For each step exactly one dovetail link
    must connect the current segment end to the next one; the overlap of
    that link determines how much of the added segment is cut.

    Parameters:
      gfa : the Gfa instance containing the segments
      segpath (list) : the segment ends of the path
      merged_name (str) : name of the merged segment; ``"short"`` requests
        an unused name from ``gfa``; if not given, the names of the merged
        segments are joined by ``"_"``
      enable_tracking (bool) : passed on to ``gfa._add_segment_to_merged``
      cut_counts (bool) : if True, the summed count tags are scaled by
        ``length / (total_cut + length)``

    Returns:
      tuple : (merged segment, whether the first segment is reversed,
        whether the last segment is reversed; the latter is None if the
        path has a single element)

    Raises:
      gfapy.ValueError : if the number of links between two consecutive
        segment ends is not exactly one
      gfapy.InconsistencyError : if ``gfa._vlevel > 0`` and the computed
        sequence length and LN differ
    """
    merged = gfa.try_get_segment(segpath[0].segment).clone()
    # The validation level is zeroed while the clone is being assembled and
    # restored afterwards.
    # NOTE(review): presumably because name/sequence temporarily hold lists
    # -- confirm.
    merged_vlevel = merged.vlevel
    merged.vlevel = 0
    total_cut = 0
    a = segpath[0]
    first_reversed = (a.end_type == "L")
    last_reversed = None
    if merged_name == "short":
        merged_name = gfa.unused_name()
    gfa._add_segment_to_merged(merged,
                               gfa.segment(a.segment),
                               first_reversed,
                               0,
                               True,
                               enable_tracking=enable_tracking,
                               merged_name=merged_name)
    for s in segpath[1:]:
        b = gfapy.SegmentEnd(s).inverted()
        ls = gfa.segment(a.segment).end_relations(a.end_type, b, "dovetails")
        if len(ls) != 1:
            raise gfapy.ValueError(
                "A single link was expected between {} and {}, "
                "{} were found".format(a, b, len(ls)))
        l = ls[0]
        if not l.overlap:
            cut = 0
        else:
            cut = min(l.overlap.length_on_query(), gfa.segment(b.segment).LN)
        total_cut += cut
        last_reversed = (b.end_type == "R")
        gfa._add_segment_to_merged(merged,
                                   gfa.segment(b.segment),
                                   last_reversed,
                                   cut,
                                   False,
                                   enable_tracking=enable_tracking,
                                   merged_name=merged_name)
        a = gfapy.SegmentEnd(b).inverted()
    merged.vlevel = merged_vlevel
    # Collapse the lists accumulated during the merge into single values.
    if isinstance(merged.name, list):
        merged.name = "_".join(merged.name)
    ortag = merged.get("or")
    if isinstance(ortag, list):
        merged.set("or", ",".join(ortag))
    if not gfapy.is_placeholder(merged.sequence):
        merged.sequence = "".join(merged.sequence)
        if not merged.LN:
            merged.LN = len(merged.sequence)
        elif gfa._vlevel > 0 and merged.LN != len(merged.sequence):
            raise gfapy.InconsistencyError(
                "Computed sequence length {} ".format(len(merged.sequence)) +
                "and computed LN {} differ".format(merged.LN))
    if merged.length is None:
        # Without a known length the counts cannot be scaled: drop them.
        for count_tag in ["KC", "RC", "FC"]:
            merged.set(count_tag, None)
    else:
        factor = 1
        if cut_counts:
            factor = merged.length / (total_cut + merged.length)
        # NOTE(review): outside of a class body the double-underscore name
        # is not mangled -- confirm that this attribute resolves on gfa.
        for count_tag, count in gfa.__sum_of_counts(segpath, factor).items():
            merged.set(count_tag, count)
    return merged, first_reversed, last_reversed
Beispiel #21
0
 def test_line_placeholder(self):
     """The two line fixtures of the class must not be seen as placeholders."""
     for fixture_name in ("a", "b"):
         fixture = getattr(TestUnitLineEquivalence, fixture_name)
         assert (not gfapy.is_placeholder(fixture))
Beispiel #22
0
 def _add_segment_to_merged(self,
                            merged,
                            segment,
                            is_reversed,
                            cut,
                            init,
                            enable_tracking=False,
                            merged_name=None):
     """Fold one segment into the segment being built by a merge.

     ``merged`` is modified in place. While the merge is in progress its
     ``sequence`` and ``name`` fields hold lists of pieces (and, with
     tracking, so does its ``or`` tag).

     Parameters:
       merged : the segment under construction
       segment : the segment to add
       is_reversed (bool) : if True the reverse complement of the segment
         sequence is used; with tracking, the name and the rn/mp position
         arrays are reversed as well
       cut (int) : number of leading characters of the (possibly reverse
         complemented) sequence which are left out
       init (bool) : True for the first segment of the merge: the fields of
         ``merged`` are initialized instead of being extended
       enable_tracking (bool) : if True the rn, mp and or fields of
         ``merged`` are maintained
       merged_name : if given, it is used as the name of ``merged`` and the
         segment names are not accumulated
     """
     n = segment.name
     if is_reversed:
         s = gfapy.sequence.rc(segment.sequence)[cut:]
         if enable_tracking:
             n = self._reverse_segment_name(segment.name, "_")
             rn = self._reverse_pos_array(segment.rn, segment.LN)
             mp = self._reverse_pos_array(segment.mp, segment.LN)
     else:
         s = segment.sequence[cut:]
         if enable_tracking:
             rn = segment.rn
             mp = segment.mp
     # rn, mp and o are only bound, and only used, when tracking is enabled
     if enable_tracking:
         # default mp: the whole extent of the segment, when LN is available
         if not mp and segment.LN:
             mp = [1, segment.LN]
         # the "or" entry falls back to the (possibly reversed) name
         if segment.get("or") is None:
             o = n
         elif is_reversed:
             o = self._reverse_segment_name(segment.get("or"), ",")
         else:
             o = segment.get("or")
     if init:
         # first segment: start the lists of pieces
         merged.sequence = [s]
         if merged_name:
             merged.name = [merged_name]
         else:
             merged.name = [n]
         merged.LN = segment.LN
         if enable_tracking:
             merged.rn = rn
             merged.set("or", [o])
             merged.mp = mp
     else:
         # a placeholder sequence in the added segment replaces the whole
         # merged sequence with a placeholder
         if gfapy.is_placeholder(segment.sequence):
             merged.sequence = gfapy.Placeholder()
         else:
             merged.sequence.append(s)
         if not merged_name:
             merged.name.append(n)
         if merged.LN:
             if enable_tracking:
                 # positions of the added segment are shifted by the current
                 # merged length, minus the part which was cut
                 if rn:
                     rn = [pos - cut + merged.LN for pos in rn]
                     if not merged.rn:
                         merged.rn = rn
                     else:
                         merged.rn += rn
                 if mp and merged.mp:
                     merged.mp += [pos - cut + merged.LN for pos in mp]
             # the length is updated after the positions, which are
             # relative to the length before this segment was added
             if segment.LN:
                 merged.LN += (segment.LN - cut)
             else:
                 merged.LN = None
         elif enable_tracking:
             # no merged length available: mp is reset
             merged.mp = None
         if enable_tracking:
             if not merged.get("or"):
                 merged.set("or", [o])
             else:
                 merged.get("or").append(o)