def from_string(cls, string, valid=False):
    """
    Create a numeric array from a string

    The string is split on commas: the first element is the subtype code,
    the remaining elements are the values. Values are converted with
    ``float`` for subtype ``f`` and with ``int`` for every other subtype.

    Parameters
    ----------
    string : str
    valid : optional bool *(default:* **False** *)*
      If **False**, check that the string is not empty and does not end
      with a comma, and validate the range of the integer values,
      according to the array subtype.
      If **True** the string is guaranteed to be valid.

    Raises
    ------
    gfapy.FormatError
      If **valid** is **False** and the string is empty or ends with a comma.
    gfapy.ValueError
      If an integer value cannot be parsed or (when **valid** is **False**)
      is outside the range of the subtype.
    gfapy.TypeError
      If the subtype code is invalid.

    Returns
    -------
    gfapy.NumericArray
      The numeric array (built by calling **cls** on the list of values)
    """
    if not valid:
        if len(string) == 0:
            raise gfapy.FormatError(
                "Numeric array string shall not be empty")
        if string[-1] == ",":
            raise gfapy.FormatError(
                "Numeric array string ends with comma\n" +
                "String: {}".format(string))
    elems = string.split(",")
    subtype = elems[0]
    if subtype not in NumericArray.SUBTYPE:
        raise gfapy.TypeError("Subtype {} unknown".format(subtype))
    is_float = subtype == "f"
    # Only integer subtypes have a range; named so that it does not shadow
    # the builtin ``range``.
    subtype_range = None if is_float else NumericArray.SUBTYPE_RANGE[subtype]

    def gen():
        for e in elems[1:]:
            if is_float:
                # NOTE: a float() failure propagates as the builtin exception.
                yield float(e)
                continue
            try:
                e = int(e)
            except ValueError:
                raise gfapy.ValueError(
                    "Value is not valid: {}\n".format(e) +
                    "Numeric array string: {}".format(string))
            # The range is half-open: lower bound included, upper excluded.
            if not valid and not (subtype_range[0] <= e < subtype_range[1]):
                raise gfapy.ValueError(
                    ("NumericArray: " +
                     "value is outside of subtype {0} range\n" +
                     "Value: {1}\n" +
                     "Range: {2}\n" +
                     "Content: {3}").format(subtype, e, repr(subtype_range),
                                            repr(elems)))
            yield e

    return cls(list(gen()))
def _from_string(cls, string, valid=False, version="gfa1"):
    """Create a CIGAR instance from its string representation.

    Parameters:
      string (str)
      valid (bool): If **True** the string is guaranteed to be valid.
        (Defaults to **False**)
      version (str): 'gfa1' or 'gfa2'; any other value skips validation.

    Returns:
      ~gfapy.alignment.cigar.CIGAR or
      ~gfapy.alignment.placeholder.AlignmentPlaceholder

    Raises:
      ~gfapy.error.FormatError: If the string is not a valid CIGAR string.
    """
    if string == "*":
        return gfapy.AlignmentPlaceholder()
    cigar = CIGAR()
    if not valid:
        # GFA1 accepts the full operation set, GFA2 only M, I, D and P.
        if version == "gfa1":
            if not re.match(r"^([0-9]+[MIDNSHPX=])+$", string):
                raise gfapy.FormatError(
                    "{} is not a valid GFA1 CIGAR string\n".format(
                        repr(string)) +
                    "(it does not match ([0-9]+[MIDNSHPX=])+)")
        elif version == "gfa2":
            if not re.match(r"^([0-9]+[MIDP])+$", string):
                raise gfapy.FormatError(
                    "{} is not a valid GFA2 CIGAR string\n".format(
                        repr(string)) +
                    "(it does not match ([0-9]+[MIDP])+)")
    for m in re.finditer("([0-9]+)([MIDNSHPX=])", string):
        cigar.append(CIGAR.Operation(int(m.group(1)), m.group(2)))
    return cigar
def _from_string(cls, string, version = "gfa2", valid = False):
    """
    Parses an alignment field

    The kind of alignment is decided by scanning the string: a lone ``*``
    is a placeholder; otherwise the string must start with a digit, and the
    first non-digit character selects a trace (``,``) or a CIGAR
    (an operation letter).

    Parameters
    ----------
    string : str
      The string to parse.
    version : str
      GFA version (gfa1 or gfa2)
      If *gfa1*, then CIGARs and Placeholders are supported.
      If *gfa2*, also Traces are supported.
      Defaults to *gfa2*.
    valid : bool
      If *True*, the string is guaranteed to be valid, and
      further checks are skipped.
      Defaults to *False*.

    Returns
    -------
    gfapy.CIGAR or gfapy.Trace or gfapy.AlignmentPlaceholder

    Raises
    ------
    gfapy.FormatError
      If the content of the field cannot be parsed.
    gfapy.VersionError
      If a wrong value is provided for the version parameter.
    """
    if version not in ("gfa1", "gfa2"):
        raise gfapy.VersionError(
            "Version error: {}".format(repr(version)))
    if string == "*":
        return gfapy.AlignmentPlaceholder()
    if string and string[0].isdigit():
        for symbol in string[1:]:
            if symbol.isdigit():
                continue
            if symbol == ",":
                if version != "gfa2":
                    raise gfapy.FormatError(
                        "Trace alignments are not allowed in GFA1: {}"
                        .format(repr(string)))
                trace = gfapy.Trace._from_string(string)
                if not valid:
                    trace.validate()
                return trace
            # =, X, S, H and N are CIGAR operations only in GFA1.
            if symbol in ("M", "I", "D", "P") or \
               (symbol in ("=", "X", "S", "H", "N") and version == "gfa1"):
                return gfapy.CIGAR._from_string(string, valid=valid,
                                                version=version)
            # Any other character ends the scan and is reported below.
            break
    raise gfapy.FormatError("Alignment field contains invalid data {}"
                            .format(repr(string)))
def validate_encoded(string):
    """
    Validate the string representation of a GFA1 segment name.

    The name must consist of printable non-space ASCII characters, must not
    start with ``*`` or ``=``, and must not contain ``+`` or ``-`` followed
    by a comma.

    Parameters
    ----------
    string : str
      The encoded segment name.

    Raises
    ------
    gfapy.FormatError
      If the string is not a valid GFA1 segment name.
    """
    # fullmatch instead of match with ^...$: "$" would also match just
    # before a trailing newline and let "name\n" through.
    if not re.fullmatch(r"[!-)+-<>-~][!-~]*", string):
        raise gfapy.FormatError(
            "{} is not a valid GFA1 segment name\n".format(repr(string)) +
            "(it does not match the regular expression [!-)+-<>-~][!-~]*")
    if re.search(r"[+-],", string):
        raise gfapy.FormatError(
            "{} is not a valid GFA1 segment name\n".format(repr(string)) +
            "(it contains + or - followed by ,)")
def validate_encoded(string):
    """
    Validate the string representation of a custom record type.

    The string must be a non-empty sequence of printable non-space ASCII
    characters and must not be one of the predefined GFA2 record types
    (E, G, F, O, U, H, #, S).

    Parameters
    ----------
    string : str
      The encoded record type.

    Raises
    ------
    gfapy.FormatError
      If the string is not a valid custom record type.
    """
    # fullmatch instead of match with ^...$: "$" would also match just
    # before a trailing newline.
    if not re.fullmatch(r"[!-~]+", string):
        raise gfapy.FormatError(
            "{} is not a valid custom record type\n".format(repr(string)) +
            "(it contains spaces and/or non-printable characters)")
    if string in ("E", "G", "F", "O", "U", "H", "#", "S"):
        raise gfapy.FormatError(
            "{} is not a valid custom record type\n".format(repr(string)) +
            "(it is a predefined GFA2 record type)")
def validate_decoded(obj):
    """
    Validate a decoded oriented GFA2 identifier.

    Parameters
    ----------
    obj : gfapy.OrientedLine
      The object to validate; its ``name`` and ``orient`` are checked.

    Raises
    ------
    gfapy.TypeError
      If **obj** is not a gfapy.OrientedLine.
    gfapy.FormatError
      If the name is not made of printable non-space ASCII characters, or
      the orientation is neither ``+`` nor ``-``.
    """
    if not isinstance(obj, gfapy.OrientedLine):
        raise gfapy.TypeError(
            "the class {} is incompatible with the datatype\n"
            .format(obj.__class__.__name__) +
            "(accepted classes: gfapy.OrientedLine)")
    # fullmatch instead of match with ^...$: "$" would also match just
    # before a trailing newline.
    if not re.fullmatch(r"[!-~]+", obj.name):
        raise gfapy.FormatError(
            "{} is not a valid oriented GFA2 identifier\n".format(
                repr(obj.name)))
    if obj.orient not in ("+", "-"):
        raise gfapy.FormatError(
            "{} is not a valid orientation\n".format(repr(obj.orient)))
def validate_decoded(iterable):
    """
    Validate a decoded list of oriented GFA2 identifiers.

    Parameters
    ----------
    iterable : iterable of gfapy.OrientedLine
      Each element is validated with its own ``validate()`` method, then
      its ``name`` and ``orient`` are checked.

    Raises
    ------
    gfapy.TypeError
      If an element is not a gfapy.OrientedLine.
    gfapy.FormatError
      If a name is not made of printable non-space ASCII characters, or an
      orientation is neither ``+`` nor ``-``.
    """
    for elem in iterable:
        if not isinstance(elem, gfapy.OrientedLine):
            raise gfapy.TypeError(
                "the list contains an object of class {}\n".format(
                    type(elem)) +
                "(accepted classes: gfapy.OrientedLine)")
        elem.validate()
        # fullmatch instead of match with ^...$: "$" would also match just
        # before a trailing newline.
        if not re.fullmatch(r"[!-~]+", elem.name):
            raise gfapy.FormatError(
                "the list contains an invalid GFA2 identifier {}\n".format(
                    elem.name) +
                "(it contains spaces and/or non-printable characters)")
        if elem.orient not in ("+", "-"):
            raise gfapy.FormatError(
                "{} is not a valid orientation".format(elem.orient))
def _parse_gfa_tag(tag):
    """
    Parses a GFA tag in the form **xx:d:content** into its components.

    The **content** is returned as found in the string; it is not decoded
    here.

    Parameters
    ----------
    tag : str
      the GFA tag to parse

    Raises
    ------
    gfapy.FormatError
      if the string does not represent a valid GFA tag

    Returns
    -------
    list of str
      the three components: tag name, datatype code, content
    """
    parsed = re.match(r"^([A-Za-z][A-Za-z0-9]):([AifZJHB]):(.+)$", tag)
    if parsed is None:
        raise gfapy.FormatError("Expected GFA tag, found: {}".format(
            repr(tag)))
    tagname, datatype, content = parsed.groups()
    return [tagname, datatype, content]
def _substring_type(self, begpos, endpos):
    """Type of substring (pfx, sfx, whole, internal) given start and end pos.

    Analyzes the begin and end position and determines if the substring is
    the whole string, or a (possibly empty) other substring, ie a prefix,
    a suffix, or an internal alignment.

    Returns a tuple (type, flag). The flag is True when begin and end are
    both the first position, both the last position, or (internal case)
    have the same value -- presumably marking an empty substring.

    Raises gfapy.ValueError if begin > end, and gfapy.FormatError if begin
    is a last position while end is not.
    """
    begin_value = gfapy.posvalue(begpos)
    end_value = gfapy.posvalue(endpos)
    if begin_value > end_value:
        raise gfapy.ValueError(
            "Line: {}\n".format(str(self)) +
            "begin > end: {}$ > {}".format(begin_value, end_value))
    if gfapy.isfirstpos(begpos):
        if gfapy.isfirstpos(endpos):
            return ("pfx", True)
        if gfapy.islastpos(endpos):
            return ("whole", False)
        return ("pfx", False)
    if gfapy.islastpos(begpos):
        if gfapy.islastpos(endpos):
            return ("sfx", True)
        raise gfapy.FormatError(
            "Line: {}\n".format(str(self)) +
            "Wrong use of $ marker\n" +
            "{} >= {}$".format(end_value, begin_value))
    if gfapy.islastpos(endpos):
        return ("sfx", False)
    return ("internal", begin_value == end_value)
def _from_string(cls, string):
    """
    Create a Trace from a comma-separated list of integers.

    Parameters
    ----------
    string : str
      The encoded trace, e.g. ``"12,13,10"``.

    Returns
    -------
    Trace
      A trace built from the parsed integers.

    Raises
    ------
    gfapy.FormatError
      If the string cannot be split into integers.
    """
    # Catch only conversion failures (including a non-string argument)
    # rather than every exception, so that e.g. KeyboardInterrupt is not
    # turned into a FormatError.
    try:
        values = [int(v) for v in string.split(",")]
    except (ValueError, TypeError, AttributeError):
        raise gfapy.FormatError(
            "string does not encode" +
            " a valid trace alignment: {}".format(string))
    return Trace(values)
def set_datatype(self, fieldname, datatype):
    """
    Set the datatype of a tag.

    If an existing tag datatype is changed, its content may become
    invalid (call **validate_field** if necessary).

    Parameters
    ----------
    fieldname : str
      The field name (it is not required that the field exists already)
    datatype : gfapy.Field.FIELD_DATATYPE
      The datatype.

    Raises
    ------
    gfapy.RuntimeError
      If **fieldname** is a predefined tag and **datatype** differs from
      its datatype.
    gfapy.FormatError
      If **fieldname** is not a valid custom tag name and the validation
      level is greater than 0.
    gfapy.ArgumentError
      If **datatype** is not a valid datatype for tags.
    """
    predefined = self._is_predefined_tag(fieldname)
    if predefined:
        # Predefined tags may only be "set" to the datatype they have.
        if self.get_datatype(fieldname) != datatype:
            raise gfapy.RuntimeError(
                "Cannot set the datatype of {} to {}\n".format(
                    fieldname, datatype) +
                "The datatype of a predefined tag cannot be changed")
    else:
        name_is_valid = self._is_valid_custom_tagname(fieldname)
        if not name_is_valid and self.vlevel > 0:
            raise gfapy.FormatError(
                "{} is not a valid custom tag name".format(fieldname))
    if datatype not in gfapy.Field.TAG_DATATYPE:
        raise gfapy.ArgumentError("Unknown datatype: {}".format(datatype))
    self._datatype[fieldname] = datatype
def validate_encoded(string):
    """
    Validate the string representation of an optional integer.

    The string must be either the placeholder ``*`` or an optionally
    signed sequence of decimal digits.

    Parameters
    ----------
    string : str
      The encoded value.

    Raises
    ------
    gfapy.FormatError
      If the string is neither ``*`` nor an integer.
    """
    # fullmatch instead of match with ^...$: "$" would also match just
    # before a trailing newline and accept e.g. "12\n".
    if not re.fullmatch(r"\*|[-+]?[0-9]+", string):
        raise gfapy.FormatError(
            "{} does not represent a valid optional integer value\n".format(
                repr(string)) +
            "(it is not * and does not match the regular expression [-+]?[0-9]+)"
        )
def _validate_tagnames_and_types(self):
    """
    Check the name and datatype of every tag of the line.

    Predefined tags are passed, with their field datatype, to
    **_validate_predefined_tag_type**; every other tag name must satisfy
    **_is_valid_custom_tagname**.

    Raises
    ------
    gfapy.FormatError
      If a tag that is not predefined has an invalid custom tag name.
    """
    for tagname in self.tagnames:
        if not self._is_predefined_tag(tagname):
            if self._is_valid_custom_tagname(tagname):
                continue
            raise gfapy.FormatError("Custom tags must be lower case\n" +
                                    "Found: {}".format(tagname))
        self._validate_predefined_tag_type(tagname,
                                           self._field_datatype(tagname))
def validate_decoded(iterable):
    """
    Validate a decoded list of oriented GFA1 segment references.

    Each element is wrapped in a gfapy.OrientedLine, validated with its
    ``validate()`` method, and its name is then checked against the GFA1
    segment name pattern.

    Parameters
    ----------
    iterable : iterable
      Elements accepted by the gfapy.OrientedLine constructor.

    Raises
    ------
    gfapy.FormatError
      If a name is not a valid GFA1 segment name.
    """
    for elem in iterable:
        oriented = gfapy.OrientedLine(elem)
        oriented.validate()
        # fullmatch instead of match with ^...$: "$" would also match just
        # before a trailing newline.
        if not re.fullmatch(r"[!-)+-<>-~][!-~]*", oriented.name):
            raise gfapy.FormatError(
                "{} is not a valid GFA1 segment name\n".format(
                    oriented.name) +
                "(it does not match [!-)+-<>-~][!-~]*)")
def _validate_tagnames_and_types(self):
    """
    Check the name and datatype of every tag of the line.

    Predefined tags are passed, with their field datatype, to
    **_validate_predefined_tag_type**; every other tag name must satisfy
    **_is_valid_custom_tagname**.

    Raises
    ------
    gfapy.FormatError
      If a tag that is not predefined has an invalid custom tag name.
    """
    for tagname in self.tagnames:
        if not self._is_predefined_tag(tagname):
            if self._is_valid_custom_tagname(tagname):
                continue
            raise gfapy.FormatError(
                "Custom tag names must consist in a letter " +
                "and a digit or two letters\nFound: {}".format(tagname))
        self._validate_predefined_tag_type(tagname,
                                           self._field_datatype(tagname))
def validate_encoded(string):
    """
    Validate the string representation of a list of CIGARs.

    The string must be a comma-separated list whose elements are either
    the placeholder ``*`` or a CIGAR string.

    Parameters
    ----------
    string : str
      The encoded list.

    Raises
    ------
    gfapy.FormatError
      If the string is not a comma-separated list of ``*`` or CIGARs.
    """
    # fullmatch instead of match with ^...$: "$" would also match just
    # before a trailing newline.
    if not re.fullmatch(
            r"(\*|(([0-9]+[MIDNSHPX=])+))(,(\*|(([0-9]+[MIDNSHPX=])+)))*",
            string):
        raise gfapy.FormatError(
            "{} is not a comma separated list of * or CIGARs\n".format(
                repr(string)) +
            "(CIGAR strings must match ([0-9]+[MIDNSHPX=])+)")
def validate_encoded(string):
    """
    Validate the string representation of a list of oriented GFA1 segments.

    The string must be a comma-separated list of at least two elements,
    each one a GFA1 segment name followed by an orientation (``+`` or
    ``-``).

    Parameters
    ----------
    string : str
      The encoded list.

    Raises
    ------
    gfapy.FormatError
      If the string does not have the required form.
    """
    # fullmatch instead of match with ^...$: "$" would also match just
    # before a trailing newline.
    if not re.fullmatch(
            r"[!-)+-<>-~][!-~]*[+-](,[!-)+-<>-~][!-~]*[+-])+", string):
        raise gfapy.FormatError(
            "{} is not a valid list of GFA1 segment names ".format(
                repr(string)) +
            "and orientations\n" +
            "(the segment names must match [!-)+-<>-~][!-~]*;\n" +
            " the orientations must be + or -;\n" +
            " the list must be comma-separated " +
            "NameOrient,NameOrient[,NameOrient...])")
def decode(string):
    """
    Decode the string representation of an optional integer.

    Parameters
    ----------
    string : str
      The encoded value.

    Returns
    -------
    gfapy.Placeholder or int
      A placeholder if the string is ``*``, otherwise the integer value.

    Raises
    ------
    gfapy.FormatError
      If the string is neither ``*`` nor convertible to an integer.
    """
    if string == "*":
        return gfapy.Placeholder()
    # Catch only conversion failures, so that unrelated exceptions
    # (e.g. KeyboardInterrupt) are not turned into a FormatError.
    try:
        return int(string)
    except (ValueError, TypeError):
        raise gfapy.FormatError(
            "the string does not represent a valid integer")
def validate_encoded(string):
    """
    Validate the string representation of a numeric array.

    The string must be a subtype letter followed by one or more
    comma-prefixed values: floats for ``f``, signed integers for ``c``,
    ``s``, ``i`` and unsigned integers for ``C``, ``S``, ``I``.

    Parameters
    ----------
    string : str
      The encoded numeric array.

    Raises
    ------
    gfapy.FormatError
      If the string is not a valid numeric array string.
    """
    # fullmatch instead of match with ^...$: "$" would also match just
    # before a trailing newline.
    if not re.fullmatch(
            r"(f(,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+|[CSI](,\+?[0-9]+)+|[csi](,[-+]?[0-9]+)+)",
            string):
        raise gfapy.FormatError(
            "{} is not a valid numeric array string\n".format(repr(string)) +
            "(it must be one of [fcsiCSI] followed by a comma-separated list of:" +
            " for f: floats; for csi: signed integers; for CSI: unsigned integers)"
        )
def _init_comment_data(data):
    """
    Normalize the data of a comment line.

    Parameters
    ----------
    data : list or str
      Either the comment line as a string, or a list. A list whose first
      element is ``"#"`` is returned unchanged; any other list is joined
      with tabs and parsed as a string.

    Returns
    -------
    list
      For string input: ``["#", content, spacer]``, where *spacer* is the
      whitespace found between ``#`` and the content.

    Raises
    ------
    gfapy.FormatError
      If the (joined) string does not begin with ``#``.
    """
    # data[:1] is [] for an empty list, so the empty case is joined to ""
    # and reported as a FormatError below instead of raising IndexError.
    if isinstance(data, list) and data[:1] != ["#"]:
        # improperly split, rejoin
        data = "\t".join(data)
    if isinstance(data, str):
        match = re.match(r"^#(\s*)(.*)$", data)
        if match is None:
            raise gfapy.FormatError("Comment lines must begin with #\n" +
                                    "Line: {}".format(data))
        data = ["#", match.group(2), match.group(1)]
    return data
def _initialize_positional_fields(self, strings):
    """
    Initialize the positional fields of the line from a list of strings.

    Parameters
    ----------
    strings : list of str
      The record type (first element) followed by the field values.

    Raises
    ------
    gfapy.FormatError
      If the record type does not match the class RECORD_TYPE (unless
      that is ``"\\n"``), or if the validation level is at least 1 and
      fewer positional fields than expected are given.
    gfapy.AssertionError
      If the version of the line is not set.
    """
    record_type = strings[0]
    if record_type != self.RECORD_TYPE and self.RECORD_TYPE != "\n":
        raise gfapy.FormatError(
            "Record type of records of " +
            "class {} must be {} ({} found)".format(
                self.__class__, self.RECORD_TYPE, record_type))
    if self.version is None:
        raise gfapy.AssertionError("Bug found, please report\n" +
                                   "strings: {}".format(repr(strings)))
    n_given = len(strings) - 1
    if (self.vlevel >= 1) and (n_given < self._n_positional_fields):
        raise gfapy.FormatError(
            "{} positional fields expected, ".format(
                self._n_positional_fields) +
            "{} found\n{}".format(n_given, repr(strings)))
    datatypes = self.__class__.DATATYPE
    # Index 0 holds the record type, so field values start at index 1.
    for position, fieldname in enumerate(self.POSFIELDS, start=1):
        self._init_field_value(fieldname, datatypes[fieldname],
                               strings[position], errmsginfo=strings)
def __validate_line(self):
    """
    Validate the line reference stored in ``self.line``.

    The reference is either a gfapy.Line, whose ``name`` is checked, or a
    string, which is checked directly.

    Raises
    ------
    gfapy.TypeError
      If ``self.line`` is neither a gfapy.Line nor a string.
    gfapy.FormatError
      If the identifier is not a non-empty sequence of printable
      non-space ASCII characters.
    """
    if isinstance(self.line, gfapy.Line):
        string = self.line.name
    elif isinstance(self.line, str):
        string = self.line
    else:
        raise gfapy.TypeError(
            "Invalid class ({}) for line reference ({})".format(
                self.line.__class__, self.line))
    # fullmatch instead of match with ^...$: "$" would also match just
    # before a trailing newline.
    if not re.fullmatch(r"[!-~]+", string):
        raise gfapy.FormatError(
            "{} is not a valid GFA identifier\n".format(repr(string)) +
            "(it contains spaces or non-printable characters)")
def set(self, fieldname, value):
    """Set the value of a field.

    If a datatype for a new custom tag is not set, the default for the
    value assigned to the field will be used (e.g. J for Hashes,
    i for Integer, etc).

    Parameters
    ----------
    fieldname : str
      The name of the field to set.
      (positional field, predefined tag (uppercase) or custom tag
      (lowercase))
    value : object
      The value to assign.

    Raises
    ------
    gfapy.RuntimeError
      If the line is virtual and **fieldname** is neither an existing
      field, a predefined tag nor an alias.
    gfapy.FormatError
      If **fieldname** is not a valid predefined or custom tag name
      (and the validation level is not 0).

    Returns
    -------
    object
      The result of setting the field; None if a new tag without datatype
      is given the value None (in which case nothing is stored).
    """
    line_class = self.__class__
    if fieldname in self._data or self._is_predefined_tag(fieldname):
        return self._set_existing_field(fieldname, value)
    if fieldname in line_class.FIELD_ALIAS:
        # Aliases are resolved by setting the field they refer to.
        return self.set(line_class.FIELD_ALIAS[fieldname], value)
    if self.virtual:
        raise gfapy.RuntimeError("Virtual lines do not have tags")
    if self.vlevel != 0 and not self._is_valid_custom_tagname(fieldname):
        raise gfapy.FormatError(
            "{} is not a positional field,".format(fieldname) +
            "an existing tag, an alias, a predefined tag or a valid custom tag\n" +
            "positional fields: {}\n".format(", ".join(
                self.positional_fieldnames)) +
            "existing tags: {}\n".format(", ".join(self.tagnames)) +
            "aliases: {}\n".format(", ".join(
                line_class.FIELD_ALIAS.keys())) +
            "predefined tags: {}\n".format(", ".join(
                line_class.PREDEFINED_TAGS)))
    self._define_field_methods(fieldname)
    if self._datatype.get(fieldname, None) is not None:
        return self._set_existing_field(fieldname, value)
    if value is None:
        return None
    self._datatype[fieldname] = gfapy.Field._get_default_gfa_tag_datatype(
        value)
    self._data[fieldname] = value
    return self._data[fieldname]
def _from_string(cls, string, valid=False):
    """
    Parse a position string, with or without the ``$`` last-position marker.

    Parameters
    ----------
    string : str
      The encoded position: an integer, optionally followed by ``$``.
    valid : bool
      If **True**, the sign of a plain integer value is not checked; the
      flag is also passed on to **cls**. Defaults to **False**.

    Returns
    -------
    object or int
      ``cls(value, valid=valid)`` if the string ends with ``$``, otherwise
      the plain integer value.

    Raises
    ------
    gfapy.FormatError
      If the numeric part cannot be converted to an integer (this
      includes the empty string).
    gfapy.ValueError
      If **valid** is **False** and a plain integer value is negative.
    """
    # Slicing keeps the empty string from raising IndexError; it falls
    # through to the integer conversion and is reported as FormatError.
    is_last = string[-1:] == "$"
    digits = string[:-1] if is_last else string
    try:
        value = int(digits)
    except (ValueError, TypeError):
        raise gfapy.FormatError(
            "LastPos value has a wrong format: {}".format(string))
    if is_last:
        return cls(value, valid=valid)
    if not valid and value < 0:
        raise gfapy.ValueError("LastPos value shall be >= 0," +
                               " {} found".format(value))
    return value
def _subclass(data):
    """
    Select the segment class (GFA1 or GFA2) from the fields of a line.

    Trailing elements that look like tags (``xx:y:...``) are skipped,
    scanning from the end; the index of the last remaining element is the
    number of positional fields (element 0 is never tested).

    Parameters
    ----------
    data : list of str
      The fields of the segment line.

    Returns
    -------
    class
      gfapy.line.segment.GFA1 for 2 positional fields,
      gfapy.line.segment.GFA2 for 3.

    Raises
    ------
    gfapy.FormatError
      If the number of positional fields is neither 2 nor 3.
    """
    last_positional = len(data) - 1
    while last_positional > 0 and \
            re.search(r"^..:.:.*$", data[last_positional]):
        last_positional -= 1
    n_positionals = last_positional
    if n_positionals == 2:
        return gfapy.line.segment.GFA1
    if n_positionals == 3:
        return gfapy.line.segment.GFA2
    raise gfapy.FormatError(
        "Wrong number of positional fields for "
        "segment line; GFA1=2, GFA2=3, found={}\n".format(
            n_positionals))
def validate_decoded(obj):
    """
    Validate a decoded list of GFA2 identifiers.

    Parameters
    ----------
    obj : list of (str or gfapy.Line)
      For gfapy.Line elements the ``name`` is checked; strings are checked
      directly.

    Raises
    ------
    gfapy.TypeError
      If **obj** is not a list, or contains an element which is neither a
      string nor a gfapy.Line.
    gfapy.FormatError
      If an identifier is not a non-empty sequence of printable non-space
      ASCII characters.
    """
    if not isinstance(obj, list):
        raise gfapy.TypeError(
            "the class {} is incompatible with the datatype\n".format(
                obj.__class__.__name__) +
            "(accepted classes: list)")
    for elem in obj:
        if isinstance(elem, str):
            name = elem
        elif isinstance(elem, gfapy.Line):
            name = str(elem.name)
        else:
            raise gfapy.TypeError(
                "the list contains an obj of class {}\n".format(
                    elem.__class__.__name__) +
                "(accepted classes: str, gfapy.Line)")
        # fullmatch instead of match with ^...$: "$" would also match just
        # before a trailing newline.
        if not re.fullmatch(r"[!-~]+", name):
            # Report the offending identifier; the message previously
            # referenced an undefined name and raised NameError.
            raise gfapy.FormatError(
                "the list contains an invalid GFA2 identifier ({})\n".format(
                    repr(name)) +
                "(it contains spaces and/or non-printable characters)")
def _from_list(cls, array, version = "gfa2", valid = True):
    """
    Converts an alignment array into a specific list type

    The type is chosen from the first element of the array: integers give
    a trace, CIGAR operations give a CIGAR. An empty array gives a
    placeholder.

    Parameters
    ----------
    array : list
      The alignment array.
    version : str
      GFA version (gfa1 or gfa2)
      If *gfa1*, then CIGARs and Placeholders are supported.
      If *gfa2*, also Traces are supported.
      Defaults to *gfa2*.
    valid : bool
      Defaults to *True*. The flag is accepted but not used here.

    Returns
    -------
    gfapy.CIGAR or gfapy.Trace or gfapy.AlignmentPlaceholder

    Raises
    ------
    gfapy.VersionError
      If the version is neither gfa1 nor gfa2, or if the array is a trace
      and the version is gfa1.
    gfapy.FormatError
      If the first element is neither an integer nor a CIGAR operation.
    """
    if version not in ("gfa1", "gfa2"):
        raise gfapy.VersionError(
            "Version error: {}".format(repr(version)))
    if not array:
        return gfapy.AlignmentPlaceholder()
    head = array[0]
    if isinstance(head, int):
        if version != "gfa2":
            raise gfapy.VersionError(
                "Trace alignments are not allowed in GFA1: {}".format(
                    repr(array)))
        return gfapy.Trace(array)
    if isinstance(head, gfapy.CIGAR.Operation):
        return gfapy.CIGAR(array)
    raise gfapy.FormatError(
        "Array does not represent a valid alignment field: {}"
        .format(repr(array)))
def decode(string):
    """
    Decode the string representation of an integer.

    Parameters
    ----------
    string : str
      The encoded value.

    Returns
    -------
    int
      The integer value.

    Raises
    ------
    gfapy.FormatError
      If the string cannot be converted to an integer.
    """
    # Catch only conversion failures, so that unrelated exceptions
    # (e.g. KeyboardInterrupt) are not turned into a FormatError.
    try:
        return int(string)
    except (ValueError, TypeError):
        raise gfapy.FormatError(
            "the string does not represent a valid integer")
def validate_encoded(string):
    """
    Validate the string representation of a GFA2 optional identifier.

    The string must be a non-empty sequence of printable non-space ASCII
    characters.

    Parameters
    ----------
    string : str
      The encoded identifier.

    Raises
    ------
    gfapy.FormatError
      If the string is empty or contains other characters.
    """
    # fullmatch instead of match with ^...$: "$" would also match just
    # before a trailing newline.
    if not re.fullmatch(r"[!-~]+", string):
        raise gfapy.FormatError(
            "{} is not a valid GFA2 optional identifier\n".format(repr(
                string)) +
            "(it contains spaces or non-printable characters)")
def validate_encoded(string):
    """
    Validate the string representation of a generic field.

    Any content is accepted, as long as it contains no newline and no tab.

    Parameters
    ----------
    string : str
      The encoded field content.

    Raises
    ------
    gfapy.FormatError
      If the string contains a newline or a tab.
    """
    has_newline = "\n" in string
    has_tab = "\t" in string
    if has_newline or has_tab:
        raise gfapy.FormatError(
            "{} is not a valid field content\n".format(repr(string)) +
            "(it contains newlines and/or tabs)")