def _unjsonify(x, isattributes=False):
    """
    Decode the JSON string `x` with simplejson.

    If `isattributes` is true, the decoded object is wrapped in
    `feature.dict_class` before being returned; otherwise the decoded object
    is returned as-is.
    """
    decoded = simplejson.loads(x)
    if not isattributes:
        return decoded
    return feature.dict_class(decoded)
def _split_keyvals(keyval_str, dialect=None):
    """
    Given the string attributes field of a GFF-like line, split it into an
    attributes dictionary and a "dialect" dictionary which contains information
    needed to reconstruct the original string.

    Lots of logic here to handle all the corner cases.

    If `dialect` is None, then do all the logic to infer a dialect from this
    attribute string.

    Otherwise, use the provided dialect (and return it at the end).

    Parameters
    ----------
    keyval_str : str
        The raw attributes field. If it is empty, an empty attributes
        dictionary is returned right away.

    dialect : dict or None
        Dialect to parse with. A provided dialect is only read, never
        modified. If None, a copy of `constants.dialect` is made and filled in
        from what is observed in `keyval_str`.

    Returns
    -------
    (quals, dialect) where `quals` is a `feature.dict_class` instance mapping
    each key to a list of string values (comma-separated values are split into
    separate list items), and `dialect` is the provided or inferred dialect.

    Raises
    ------
    helpers.AttributeStringError
        While inferring, if a comma-separated multi-value is seen after a
        repeated key has already been seen.

    AssertionError
        If a field splits into more than a key and one value (e.g. a GFF3
        field containing more than one keyval separator).
    """
    infer_dialect = dialect is None
    if infer_dialect:
        # Make a copy of default dialect so it can be modified as needed
        dialect = copy.copy(constants.dialect)

    quals = feature.dict_class()
    if not keyval_str:
        return quals, dialect

    # If a dialect was provided, then use that directly.
    if not infer_dialect:
        if dialect["trailing semicolon"]:
            keyval_str = keyval_str.rstrip(";")

        parts = keyval_str.split(dialect["field separator"])
        kvsep = dialect["keyval separator"]

        if dialect["fmt"] == "gff3":
            key_vals = [p.split(kvsep) for p in parts]
        else:
            strip_semicolon = dialect["leading semicolon"]
            key_vals = []
            for p in parts:
                # Only drop a leading semicolon that is really there, and do
                # so for every field. This mirrors what the inference code
                # below does, so a string parses the same way whether its
                # dialect is inferred or supplied, and a key that does not
                # start with ";" never loses its first character.
                if strip_semicolon and p[:1] == ";":
                    p = p[1:]
                fields = p.strip().split(kvsep)
                # Key is the first item; everything else is the value.
                key_vals.append((fields[0], " ".join(fields[1:])))

        quoted = dialect["quoted GFF2 values"]
        for item in key_vals:
            # Easy if it follows spec
            if len(item) == 2:
                key, val = item

            # Only key provided?
            else:
                assert len(item) == 1, item
                key = item[0]
                val = ""

            if key not in quals:
                quals[key] = []

            # Remove the surrounding quotes only if this dialect uses them
            if quoted and len(val) > 0 and val[0] == '"' and val[-1] == '"':
                val = val[1:-1]

            if val:
                # TODO: if there are extra commas for a value, just use empty
                # strings
                # quals[key].extend([v for v in val.split(',') if v])
                quals[key].extend(val.split(","))

        return quals, dialect

    # If we got here, then we need to infer the dialect....
    #
    # Reset the order to an empty list so that it will only be populated with
    # keys that are found in the file.
    dialect["order"] = []

    # ensembl GTF has trailing semicolon
    if keyval_str[-1] == ";":
        keyval_str = keyval_str[:-1]
        dialect["trailing semicolon"] = True

    # GFF2/GTF has a semicolon with at least one space after it.
    # Spaces can be on both sides (e.g. wormbase)
    # GFF3 works with no spaces.
    # So split on the first one we can recognize...
    #
    # If none of them splits the string, `parts` holds the whole string as a
    # single field and the dialect's field separator is left untouched.
    for sep in (" ; ", "; ", ";"):
        parts = keyval_str.split(sep)
        if len(parts) > 1:
            dialect["field separator"] = sep
            break

    # Is it GFF3?  They have key-vals separated by "="
    if gff3_kw_pat.match(parts[0]):
        key_vals = [p.split("=") for p in parts]
        dialect["fmt"] = "gff3"
        dialect["keyval separator"] = "="

    # Otherwise, key-vals separated by space.  Key is first item.
    else:
        dialect["keyval separator"] = " "
        key_vals = []
        for p in parts:
            # Fix misplaced semicolons in keys in some GFF2 files
            if p and p[0] == ";":
                p = p[1:]
                dialect["leading semicolon"] = True
            fields = p.strip().split(" ")
            key_vals.append((fields[0], " ".join(fields[1:])))

    for item in key_vals:
        # Easy if it follows spec
        if len(item) == 2:
            key, val = item

        # Only key provided?
        else:
            assert len(item) == 1, item
            key = item[0]
            val = ""

        # Is the key already in there?
        if key in quals:
            dialect["repeated keys"] = True
        else:
            quals[key] = []

        # Remove quotes in GFF2
        if len(val) > 0 and val[0] == '"' and val[-1] == '"':
            val = val[1:-1]
            dialect["quoted GFF2 values"] = True

        if val:
            # TODO: if there are extra commas for a value, just use empty
            # strings
            # quals[key].extend([v for v in val.split(',') if v])
            vals = val.split(",")
            if (len(vals) > 1) and dialect["repeated keys"]:
                raise helpers.AttributeStringError(
                    "Internally inconsistent attributes formatting: "
                    "some have repeated keys, some do not."
                )
            quals[key].extend(vals)

        # keep track of the order of keys
        dialect["order"].append(key)

    # NOTE: values are intentionally not URL-unquoted here.
    #
    # TODO: urllib.unquote breaks round trip invariance for "hybrid1.gff3"
    # test file.  This is because the "Note" field has %xx escape chars,
    # but "Dbxref" has ":" which, if everything were consistent, should
    # have also been escaped.
    #
    # (By the way, GFF3 spec says only literal use of \t, \n, \r, %, and
    # control characters should be encoded)
    #
    # Solution 1: don't unquote
    # Solution 2: store, along with each attribute, whether or not it
    #             should be quoted later upon reconstruction
    # Solution 3: don't care about invariance

    # Space-separated key-vals with quoted values are treated as GTF
    if (dialect["keyval separator"] == " ") and (dialect["quoted GFF2 values"]):
        dialect["fmt"] = "gtf"

    return quals, dialect