Esempio n. 1
0
def _split_keyvals(keyval_str, dialect=None):
    """
    Given the string attributes field of a GFF-like line, split it into an
    attributes dictionary and a "dialect" dictionary which contains information
    needed to reconstruct the original string.

    Lots of logic here to handle all the corner cases.

    If `dialect` is None, then do all the logic to infer a dialect from this
    attribute string.

    Otherwise, use the provided dialect (and return it at the end).

    Parameters
    ----------
    keyval_str : str or None
        The raw attributes field.  If it is empty or None, an empty attributes
        dictionary is returned together with the dialect (a copy of the
        default one if none was provided).
    dialect : dict or None
        Dialect to parse with.  A provided dialect is only read here, never
        modified.  When None, a copy of `constants.dialect` is filled in while
        parsing.

    Returns
    -------
    tuple
        `(quals, dialect)` where `quals` is a `feature.dict_class()` instance
        mapping each key to a list of string values.

    Raises
    ------
    AttributeStringError
        Only while inferring a dialect: a value contains commas (multiple
        values) while the dialect is flagged as using repeated keys.
    """
    infer_dialect = dialect is None
    if infer_dialect:
        # Make a copy of default dialect so it can be modified as needed
        dialect = copy.copy(constants.dialect)
    from gffutils import feature
    quals = feature.dict_class()
    if not keyval_str:
        return quals, dialect

    # If a dialect was provided, then use that directly.
    if not infer_dialect:
        if dialect['trailing semicolon']:
            keyval_str = keyval_str.rstrip(';')

        parts = keyval_str.split(dialect['field separator'])
        kvsep = dialect['keyval separator']

        if dialect['fmt'] == 'gff3':
            key_vals = [p.split(kvsep) for p in parts]
        else:
            # Mirror the inference branch below: a misplaced semicolon is
            # dropped from any part that actually starts with one.  Checking
            # for the semicolon (rather than blindly dropping the first
            # character) keeps keys intact on lines that do not have it.
            strip_semicolon = dialect['leading semicolon']
            pieces = []
            for p in parts:
                if strip_semicolon and p.startswith(';'):
                    p = p[1:]
                pieces.append(p.strip().split(kvsep))
            # Built once, after all pieces are collected.  The key is the
            # first token; everything else is the value.
            key_vals = [(p[0], " ".join(p[1:])) for p in pieces]

        quoted = dialect['quoted GFF2 values']
        for item in key_vals:
            # Easy if it follows spec
            if len(item) == 2:
                key, val = item

            # Only key provided?
            elif len(item) == 1:
                key = item[0]
                val = ''

            # Value itself contains the key-val separator; glue it back
            # together.
            else:
                key = item[0]
                val = kvsep.join(item[1:])

            # Make sure the key exists, even if it ends up with no values.
            try:
                quals[key]
            except KeyError:
                quals[key] = []

            # Only unwrap quotes if the dialect says values are quoted.
            if quoted and val and val[0] == '"' and val[-1] == '"':
                val = val[1:-1]

            if val:
                # TODO: if there are extra commas for a value, just use empty
                # strings
                # quals[key].extend([v for v in val.split(',') if v])
                quals[key].extend(val.split(','))

        return quals, dialect

    # If we got here, then we need to infer the dialect....
    #
    # Reset the order to an empty list so that it will only be populated with
    # keys that are found in the file.
    dialect['order'] = []

    # ensembl GTF has trailing semicolon
    if keyval_str[-1] == ';':
        keyval_str = keyval_str[:-1]
        dialect['trailing semicolon'] = True

    # GFF2/GTF has a semicolon with at least one space after it.
    # Spaces can be on both sides (e.g. wormbase)
    # GFF3 works with no spaces.
    # So split on the first one we can recognize...
    #
    # If none of them splits the string, `parts` is the whole string as a
    # single item and the dialect keeps its existing field separator.
    for sep in (' ; ', '; ', ';'):
        parts = keyval_str.split(sep)
        if len(parts) > 1:
            dialect['field separator'] = sep
            break

    # Is it GFF3?  They have key-vals separated by "="
    if gff3_kw_pat.match(parts[0]):
        key_vals = [p.split('=') for p in parts]
        dialect['fmt'] = 'gff3'
        dialect['keyval separator'] = '='

    # Otherwise, key-vals separated by space.  Key is first item.
    else:
        dialect['keyval separator'] = " "
        pieces = []
        for p in parts:
            # Fix misplaced semicolons in keys in some GFF2 files
            if p and p[0] == ';':
                p = p[1:]
                dialect['leading semicolon'] = True
            pieces.append(p.strip().split(' '))
        key_vals = [(p[0], " ".join(p[1:])) for p in pieces]

    for item in key_vals:

        # Easy if it follows spec
        if len(item) == 2:
            key, val = item

        # Only key provided?
        elif len(item) == 1:
            key = item[0]
            val = ''

        # Pathological cases where values of a key have within them the key-val
        # separator, e.g.,
        #  Alias=SGN-M1347;ID=T0028;Note=marker name(s): T0028 SGN-M1347 |identity=99.58|escore=2e-126
        else:
            key = item[0]
            val = dialect['keyval separator'].join(item[1:])

        # Is the key already in there?
        if key in quals:
            dialect['repeated keys'] = True
        else:
            quals[key] = []

        # Remove quotes in GFF2
        if len(val) > 0 and val[0] == '"' and val[-1] == '"':
            val = val[1:-1]
            dialect['quoted GFF2 values'] = True
        if val:
            # TODO: if there are extra commas for a value, just use empty
            # strings
            # quals[key].extend([v for v in val.split(',') if v])
            vals = val.split(',')
            # Comma-separated values and repeated keys are two different ways
            # of giving a key several values; seeing both is an error.
            if (len(vals) > 1) and dialect['repeated keys']:
                raise AttributeStringError(
                    "Internally inconsistent attributes formatting: "
                    "some have repeated keys, some do not.")
            quals[key].extend(vals)

        # keep track of the order of keys
        dialect['order'].append(key)

    # TODO: values are deliberately NOT passed through urllib.unquote here.
    # Doing so breaks round trip invariance for the "hybrid1.gff3" test file,
    # because the "Note" field has %xx escape chars, but "Dbxref" has ":"
    # which, if everything were consistent, should have also been escaped.
    #
    # (By the way, GFF3 spec says only literal use of \t, \n, \r, %, and
    # control characters should be encoded)
    #
    # Solution 1: don't unquote
    # Solution 2: store, along with each attribute, whether or not it
    #             should be quoted later upon reconstruction
    # Solution 3: don't care about invariance

    # Space-separated key-vals with quoted values means GTF.
    if ((dialect['keyval separator'] == ' ')
            and (dialect['quoted GFF2 values'])):
        dialect['fmt'] = 'gtf'

    return quals, dialect
Esempio n. 2
0
def test_empty_split_keyvals():
    """Parsing a None attribute string gives no attributes and an unchanged dialect."""
    result = parser._split_keyvals(keyval_str=None)
    parsed_attrs, returned_dialect = result
    # Nothing to parse, so the attributes container must be empty ...
    assert parsed_attrs == feature.dict_class()
    # ... and the returned dialect must compare equal to constants.dialect.
    assert returned_dialect == constants.dialect
Esempio n. 3
0
def _split_keyvals(keyval_str, dialect=None):
    """
    Given the string attributes field of a GFF-like line, split it into an
    attributes dictionary and a "dialect" dictionary which contains information
    needed to reconstruct the original string.

    Lots of logic here to handle all the corner cases.

    If `dialect` is None, then do all the logic to infer a dialect from this
    attribute string.

    Otherwise, use the provided dialect (and return it at the end).

    Parameters
    ----------
    keyval_str : str or None
        The raw attributes field.  If it is empty or None, an empty attributes
        dictionary is returned together with the dialect (a copy of the
        default one if none was provided).
    dialect : dict or None
        Dialect to parse with.  A provided dialect is only read here, never
        modified.  When None, a copy of `constants.dialect` is filled in while
        parsing.

    Returns
    -------
    tuple
        `(quals, dialect)` where `quals` is a `feature.dict_class()` instance
        mapping each key to a list of string values.

    Raises
    ------
    AttributeStringError
        Only while inferring a dialect: a value contains commas (multiple
        values) while the dialect is flagged as using repeated keys.
    """
    infer_dialect = dialect is None
    if infer_dialect:
        # Make a copy of default dialect so it can be modified as needed
        dialect = copy.copy(constants.dialect)
    from gffutils import feature
    quals = feature.dict_class()
    if not keyval_str:
        return quals, dialect

    # If a dialect was provided, then use that directly.
    if not infer_dialect:
        if dialect['trailing semicolon']:
            keyval_str = keyval_str.rstrip(';')

        parts = keyval_str.split(dialect['field separator'])
        kvsep = dialect['keyval separator']

        if dialect['fmt'] == 'gff3':
            key_vals = [p.split(kvsep) for p in parts]
        else:
            # Mirror the inference branch below: a misplaced semicolon is
            # dropped from any part that actually starts with one.  Checking
            # for the semicolon (rather than blindly dropping the first
            # character) keeps keys intact on lines that do not have it.
            strip_semicolon = dialect['leading semicolon']
            pieces = []
            for p in parts:
                if strip_semicolon and p.startswith(';'):
                    p = p[1:]
                pieces.append(p.strip().split(kvsep))
            # Built once, after all pieces are collected.  The key is the
            # first token; everything else is the value.
            key_vals = [(p[0], " ".join(p[1:])) for p in pieces]

        quoted = dialect['quoted GFF2 values']
        for item in key_vals:
            # Easy if it follows spec
            if len(item) == 2:
                key, val = item

            # Only key provided?
            elif len(item) == 1:
                key = item[0]
                val = ''

            # Value itself contains the key-val separator; glue it back
            # together.
            else:
                key = item[0]
                val = kvsep.join(item[1:])

            # Make sure the key exists, even if it ends up with no values.
            try:
                quals[key]
            except KeyError:
                quals[key] = []

            # Only unwrap quotes if the dialect says values are quoted.
            if quoted and val and val[0] == '"' and val[-1] == '"':
                val = val[1:-1]

            if val:
                # TODO: if there are extra commas for a value, just use empty
                # strings
                # quals[key].extend([v for v in val.split(',') if v])
                quals[key].extend(val.split(','))

        return quals, dialect

    # If we got here, then we need to infer the dialect....
    #
    # Reset the order to an empty list so that it will only be populated with
    # keys that are found in the file.
    dialect['order'] = []

    # ensembl GTF has trailing semicolon
    if keyval_str[-1] == ';':
        keyval_str = keyval_str[:-1]
        dialect['trailing semicolon'] = True

    # GFF2/GTF has a semicolon with at least one space after it.
    # Spaces can be on both sides (e.g. wormbase)
    # GFF3 works with no spaces.
    # So split on the first one we can recognize...
    #
    # If none of them splits the string, `parts` is the whole string as a
    # single item and the dialect keeps its existing field separator.
    for sep in (' ; ', '; ', ';'):
        parts = keyval_str.split(sep)
        if len(parts) > 1:
            dialect['field separator'] = sep
            break

    # Is it GFF3?  They have key-vals separated by "="
    if gff3_kw_pat.match(parts[0]):
        key_vals = [p.split('=') for p in parts]
        dialect['fmt'] = 'gff3'
        dialect['keyval separator'] = '='

    # Otherwise, key-vals separated by space.  Key is first item.
    else:
        dialect['keyval separator'] = " "
        pieces = []
        for p in parts:
            # Fix misplaced semicolons in keys in some GFF2 files
            if p and p[0] == ';':
                p = p[1:]
                dialect['leading semicolon'] = True
            pieces.append(p.strip().split(' '))
        key_vals = [(p[0], " ".join(p[1:])) for p in pieces]

    for item in key_vals:

        # Easy if it follows spec
        if len(item) == 2:
            key, val = item

        # Only key provided?
        elif len(item) == 1:
            key = item[0]
            val = ''

        # Pathological cases where values of a key have within them the key-val
        # separator, e.g.,
        #  Alias=SGN-M1347;ID=T0028;Note=marker name(s): T0028 SGN-M1347 |identity=99.58|escore=2e-126
        else:
            key = item[0]
            val = dialect['keyval separator'].join(item[1:])

        # Is the key already in there?
        if key in quals:
            dialect['repeated keys'] = True
        else:
            quals[key] = []

        # Remove quotes in GFF2
        if len(val) > 0 and val[0] == '"' and val[-1] == '"':
            val = val[1:-1]
            dialect['quoted GFF2 values'] = True
        if val:
            # TODO: if there are extra commas for a value, just use empty
            # strings
            # quals[key].extend([v for v in val.split(',') if v])
            vals = val.split(',')
            # Comma-separated values and repeated keys are two different ways
            # of giving a key several values; seeing both is an error.
            if (len(vals) > 1) and dialect['repeated keys']:
                raise AttributeStringError(
                    "Internally inconsistent attributes formatting: "
                    "some have repeated keys, some do not.")
            quals[key].extend(vals)

        # keep track of the order of keys
        dialect['order'].append(key)

    # TODO: values are deliberately NOT passed through urllib.unquote here.
    # Doing so breaks round trip invariance for the "hybrid1.gff3" test file,
    # because the "Note" field has %xx escape chars, but "Dbxref" has ":"
    # which, if everything were consistent, should have also been escaped.
    #
    # (By the way, GFF3 spec says only literal use of \t, \n, \r, %, and
    # control characters should be encoded)
    #
    # Solution 1: don't unquote
    # Solution 2: store, along with each attribute, whether or not it
    #             should be quoted later upon reconstruction
    # Solution 3: don't care about invariance

    # Space-separated key-vals with quoted values means GTF.
    if (
        (dialect['keyval separator'] == ' ') and
        (dialect['quoted GFF2 values'])
    ):
        dialect['fmt'] = 'gtf'

    return quals, dialect
Esempio n. 4
0
def test_empty_split_keyvals():
    """Parsing a None attribute string gives no attributes and an unchanged dialect."""
    result = parser._split_keyvals(keyval_str=None)
    parsed_attrs, returned_dialect = result
    # Nothing to parse, so the attributes container must be empty ...
    assert parsed_attrs == feature.dict_class()
    # ... and the returned dialect must compare equal to constants.dialect.
    assert returned_dialect == constants.dialect