def test_expand_kXHC1983(expanded_data, ucn, fieldval, expected):
    r"""
    Verify expansion of the kXHC1983 field, both in the pre-expanded
    dataset and via a direct ``expansion.expand_field`` call.

    Each pīnyīn reading is preceded by the character’s location(s) in the
    dictionary, separated from the reading by “:” (colon); multiple locations
    for a given reading are separated by “,” (comma); multiple “location:
    reading” values are separated by “ ” (space). Each location reference is of
    the form /[0-9]{4}\.[0-9]{3}\*?/ . The number preceding the period is the
    page number, zero-padded to four digits. The first two digits of the number
    following the period are the entry’s position on the page, zero-padded. The
    third digit is 0 for a main entry and greater than 0 for a parenthesized
    variant of the main entry. A trailing “*” (asterisk) on the location
    indicates an encoded variant substituted for an unencoded character (see
    below).

    As of the present writing (Unicode 5.1), the XHC source data contains 204
    unencoded characters (198 of which were represented by PUA or CJK
    Compatibility [or in one case, by non-CJK, see below] characters), for the
    most part simplified variants. Each of these 198 characters in the source
    is replaced by one or more encoded variants (references in all 204 cases
    are marked with a trailing “*”; see above). Many of these unencoded forms
    are already in the pipeline for future encoding, and future revisions of
    this data will eliminate trailing asterisks from mappings.
    """
    # Locate the record for this code point; IndexError means the fixture
    # data is missing the expected character.
    record = [entry for entry in expanded_data if entry['ucn'] == ucn][0]
    assert record['kXHC1983'] == expected

    # The raw field value must expand to the same structure.
    assert expansion.expand_field('kXHC1983', fieldval) == expected
Example #2
0
def test_expand_kXHC1983(expanded_data, ucn, fieldval, expected):
    # NOTE: the docstring must be a raw string — it contains regex escapes
    # (\. and \*) that are invalid escape sequences in a plain string literal
    # and trigger a SyntaxWarning on modern Python.
    r"""
    Verify expansion of the kXHC1983 field, both in the pre-expanded
    dataset and via a direct ``expansion.expand_field`` call.

    Each pīnyīn reading is preceded by the character’s location(s) in the
    dictionary, separated from the reading by “:” (colon); multiple locations
    for a given reading are separated by “,” (comma); multiple “location:
    reading” values are separated by “ ” (space). Each location reference is of
    the form /[0-9]{4}\.[0-9]{3}\*?/ . The number preceding the period is the
    page number, zero-padded to four digits. The first two digits of the number
    following the period are the entry’s position on the page, zero-padded. The
    third digit is 0 for a main entry and greater than 0 for a parenthesized
    variant of the main entry. A trailing “*” (asterisk) on the location
    indicates an encoded variant substituted for an unencoded character (see
    below).

    As of the present writing (Unicode 5.1), the XHC source data contains 204
    unencoded characters (198 of which were represented by PUA or CJK
    Compatibility [or in one case, by non-CJK, see below] characters), for the
    most part simplified variants. Each of these 198 characters in the source
    is replaced by one or more encoded variants (references in all 204 cases
    are marked with a trailing “*”; see above). Many of these unencoded forms
    are already in the pipeline for future encoding, and future revisions of
    this data will eliminate trailing asterisks from mappings.
    """
    # Locate the record for this code point in the expanded fixture data.
    item = [i for i in expanded_data if i['ucn'] == ucn][0]
    assert item['kXHC1983'] == expected

    # The raw field value must expand to the same structure.
    assert expansion.expand_field('kXHC1983', fieldval) == expected
Example #3
0
def expand_delimiters(normalized_data):
    """Return expanded multi-value fields in UNIHAN.

    :param normalized_data: Expects data in list of hashes, per
        :meth:`process.normalize`
    :type normalized_data: list of dict
    :returns: Items which have fields with delimiters and custom separation
        rules, will be expanded. Including multi-value fields not using both
        fields (so all fields stay consistent).
    :rtype: list of dict

    .. note:: Mutates ``normalized_data`` in place and returns it.
    """
    for char in normalized_data:
        for field in char.keys():
            # Skip empty/falsy values — nothing to expand.
            if not char[field]:
                continue
            char[field] = expansion.expand_field(field, char[field])

    return normalized_data
Example #4
0
def expand_delimiters(normalized_data):
    """
    Return expanded multi-value fields in UNIHAN.

    Parameters
    ----------
    normalized_data : list of dict
        Expects data in list of hashes, per :meth:`process.normalize`

    Returns
    -------
    list of dict :
        Items which have fields with delimiters and custom separation rules,
        will be expanded. Including multi-value fields not using both fields
        (so all fields stay consistent).
    """
    for record in normalized_data:
        # Rewriting values during .items() iteration is safe: no keys are
        # added or removed, only values replaced in place.
        for key, value in record.items():
            if value:
                record[key] = expansion.expand_field(key, value)

    return normalized_data
def test_expand_kCCCII(expanded_data, ucn, fieldval, expected):
    """Check kCCCII expansion in fixture data and via expand_field."""
    matches = list(filter(lambda row: row['ucn'] == ucn, expanded_data))
    record = matches[0]
    assert record['kCCCII'] == expected

    assert expansion.expand_field('kCCCII', fieldval) == expected
def test_expand_kIRG_KPSource(expanded_data, field, ucn, fieldval, expected):
    """Check expansion of a kIRG_KPSource-style field, parameterized by name."""
    matches = list(filter(lambda row: row['ucn'] == ucn, expanded_data))
    record = matches[0]
    assert record[field] == expected

    assert expansion.expand_field(field, fieldval) == expected
Example #7
0
def test_expand_kIRG_KPSource(expanded_data, field, ucn, fieldval, expected):
    """Verify the named field expands identically in fixture and raw form."""
    selected = [entry for entry in expanded_data if entry['ucn'] == ucn][0]
    assert selected[field] == expected

    expanded = expansion.expand_field(field, fieldval)
    assert expanded == expected
Example #8
0
def test_expand_kCCCII(expanded_data, ucn, fieldval, expected):
    """Verify kCCCII expands identically in fixture data and raw form."""
    selected = [entry for entry in expanded_data if entry['ucn'] == ucn][0]
    assert selected['kCCCII'] == expected

    expanded = expansion.expand_field('kCCCII', fieldval)
    assert expanded == expected