def get_unicode_properties():
    """
    Build dict of unicode properties

    Reads LOCAL_PROPS_FILE and then LOCAL_DERIVED_CORE_PROPS_FILE and
    merges both into one mapping of lower-cased property name to the list
    of characters (as returned by wide_unichr) listed for that property,
    in file order.

    Each item yielded by parse_file is unpacked as (char_range, prop).
    parse_char_range's result is treated as an inclusive (start, end)
    pair when it has two items, as a single code point when it is
    otherwise non-empty, and is skipped when it is empty.
    """
    props = defaultdict(list)

    for path in (LOCAL_PROPS_FILE, LOCAL_DERIVED_CORE_PROPS_FILE):
        # The context manager closes each data file once it has been
        # consumed; the loop stays inside the "with" in case parse_file
        # reads lazily.
        with open(path) as props_file:
            for char_range, prop in parse_file(props_file):
                char_range = parse_char_range(char_range)

                if len(char_range) == 2:
                    name = prop.lower()
                    # The end of the range is inclusive, hence the + 1
                    for i in xrange(char_range[0], char_range[1] + 1):
                        props[name].append(wide_unichr(i))
                elif char_range:
                    props[prop.lower()].append(wide_unichr(char_range[0]))

    return dict(props)
def get_unicode_blocks():
    """
    Build dict of unicode blocks

    Reads LOCAL_BLOCKS_FILE and returns a mapping of lower-cased block
    name to the list of characters (as returned by wide_unichr) listed
    for that block, in file order.

    Each item yielded by parse_file is unpacked as (char_range, block).
    parse_char_range's result is treated as an inclusive (start, end)
    pair when it has two items, as a single code point when it is
    otherwise non-empty, and is skipped when it is empty.
    """
    blocks = defaultdict(list)

    # The context manager closes the data file once it has been consumed;
    # the loop stays inside the "with" in case parse_file reads lazily.
    with open(LOCAL_BLOCKS_FILE) as blocks_file:
        for char_range, block in parse_file(blocks_file):
            char_range = parse_char_range(char_range)

            if len(char_range) == 2:
                name = block.lower()
                # The end of the range is inclusive, hence the + 1
                for i in xrange(char_range[0], char_range[1] + 1):
                    blocks[name].append(wide_unichr(i))
            elif char_range:
                blocks[block.lower()].append(wide_unichr(char_range[0]))

    return dict(blocks)
def get_word_break_properties():
    """
    Build dict of word break properties

    Reads LOCAL_WORD_BREAKS_FILE and returns a mapping of property name
    to the list of characters (as returned by wide_unichr) listed for
    that property, in file order. Property names are used as keys exactly
    as parsed; they are not lower-cased here.

    Each item yielded by parse_file is unpacked as (char_range, prop).
    parse_char_range's result is treated as an inclusive (start, end)
    pair when it has two items, as a single code point when it is
    otherwise non-empty, and is skipped when it is empty.
    """
    props = defaultdict(list)

    # The context manager closes the data file once it has been consumed;
    # the loop stays inside the "with" in case parse_file reads lazily.
    with open(LOCAL_WORD_BREAKS_FILE) as props_file:
        for char_range, prop in parse_file(props_file):
            char_range = parse_char_range(char_range)

            if len(char_range) == 2:
                # The end of the range is inclusive, hence the + 1
                for i in xrange(char_range[0], char_range[1] + 1):
                    props[prop].append(wide_unichr(i))
            elif char_range:
                props[prop].append(wide_unichr(char_range[0]))

    return dict(props)
def get_unicode_combining_classes():
    """
    Group the characters from parse_unicode_data() by combining class.

    Returns a plain dict whose keys are int(row.combining) and whose
    values are lists of characters built with wide_unichr from each
    row's code, in the order the rows are produced.
    """
    chars_by_class = defaultdict(list)

    for entry in parse_unicode_data():
        bucket = chars_by_class[int(entry.combining)]
        code_point = unicode_to_integer(entry.code)
        bucket.append(wide_unichr(code_point))

    return dict(chars_by_class)
def get_unicode_categories():
    """
    Group the characters from parse_unicode_data() by category.

    Returns a plain dict keyed by row.category (e.g. 'Lu', 'Ll') whose
    values are lists of characters built with wide_unichr from each
    row's code, in the order the rows are produced.
    """
    chars_by_category = defaultdict(list)

    for entry in parse_unicode_data():
        bucket = chars_by_category[entry.category]
        code_point = unicode_to_integer(entry.code)
        bucket.append(wide_unichr(code_point))

    return dict(chars_by_category)
def get_unicode_combining_classes():
    '''
    Collect characters into lists keyed by their combining class.

    The key is int(row.combining) for each row of parse_unicode_data();
    the value is the list of wide_unichr characters for the rows sharing
    that class, in row order. A plain dict is returned.
    '''
    grouped = defaultdict(list)

    for record in parse_unicode_data():
        members = grouped[int(record.combining)]
        members.append(wide_unichr(unicode_to_integer(record.code)))

    return dict(grouped)
def get_unicode_categories():
    '''
    Collect characters into lists keyed by their category.

    The key is row.category (e.g. 'Lu', 'Ll') for each row of
    parse_unicode_data(); the value is the list of wide_unichr characters
    for the rows sharing that category, in row order. A plain dict is
    returned.
    '''
    grouped = defaultdict(list)

    for record in parse_unicode_data():
        members = grouped[record.category]
        members.append(wide_unichr(unicode_to_integer(record.code)))

    return dict(grouped)
def init_unicode_categories():
    """
    Populate the module-level unicode lookup tables.

    Fills, in order: categories, combining classes, general categories,
    scripts, script ids, blocks, properties, property aliases, word
    breaks, and finally the property value aliases (plus the category
    aliases derived from them). Most tables are updated in place;
    unicode_scripts is rebound to a plain dict.
    """
    global unicode_categories, unicode_general_categories, unicode_category_aliases
    global unicode_combining_classes, unicode_blocks, unicode_properties
    global unicode_property_aliases, unicode_property_value_aliases
    global unicode_scripts, unicode_script_ids, unicode_word_breaks

    unicode_categories.update(get_unicode_categories())
    unicode_combining_classes.update(get_unicode_combining_classes())

    # General categories are keyed by the first character of the category key
    for category, chars in unicode_categories.items():
        unicode_general_categories[category[0]].extend(chars)

    # The position in script_chars is used as the code point for wide_unichr
    script_chars = get_chars_by_script()
    for code_point, script in enumerate(script_chars):
        if not script:
            continue
        unicode_scripts[script.lower()].append(wide_unichr(code_point))

    # Rebinds the module global to a plain dict copy
    unicode_scripts = dict(unicode_scripts)

    unicode_script_ids.update(build_master_scripts_list(script_chars))

    unicode_blocks.update(get_unicode_blocks())
    unicode_properties.update(get_unicode_properties())
    # Must run before the loop below, which looks property names up here
    unicode_property_aliases.update(get_property_aliases())

    unicode_word_breaks.update(get_word_break_properties())

    for prop, aliases in get_property_value_aliases().iteritems():
        prop = unicode_property_aliases.get(prop, prop)

        if prop == GENERAL_CATEGORY_PROP:
            for alias, category in aliases.iteritems():
                alias = alias.lower()
                unicode_category_aliases[alias] = category
                # Also register the alias with underscores stripped
                if "_" in alias:
                    unicode_category_aliases[alias.replace("_", "")] = category

        unicode_property_value_aliases[prop] = aliases
def init_unicode_categories():
    '''
    Populate the module-level unicode lookup tables.

    Fills, in order: categories, combining classes, general categories,
    scripts, script ids, blocks, properties, property aliases, word
    breaks, and finally the property value aliases (plus the category
    aliases derived from them). Most tables are updated in place;
    unicode_scripts is rebound to a plain dict.
    '''
    global unicode_categories, unicode_general_categories, unicode_category_aliases
    global unicode_combining_classes, unicode_blocks, unicode_properties
    global unicode_property_aliases, unicode_property_value_aliases
    global unicode_scripts, unicode_script_ids, unicode_word_breaks

    unicode_categories.update(get_unicode_categories())
    unicode_combining_classes.update(get_unicode_combining_classes())

    # General categories are keyed by the first character of the category key
    for category, chars in unicode_categories.items():
        unicode_general_categories[category[0]].extend(chars)

    # The position in script_chars is used as the code point for wide_unichr
    script_chars = get_chars_by_script()
    for code_point, script in enumerate(script_chars):
        if not script:
            continue
        unicode_scripts[script.lower()].append(wide_unichr(code_point))

    # Rebinds the module global to a plain dict copy
    unicode_scripts = dict(unicode_scripts)

    unicode_script_ids.update(build_master_scripts_list(script_chars))

    unicode_blocks.update(get_unicode_blocks())
    unicode_properties.update(get_unicode_properties())
    # Must run before the loop below, which looks property names up here
    unicode_property_aliases.update(get_property_aliases())

    unicode_word_breaks.update(get_word_break_properties())

    for prop, aliases in get_property_value_aliases().iteritems():
        prop = unicode_property_aliases.get(prop, prop)

        if prop == GENERAL_CATEGORY_PROP:
            for alias, category in aliases.iteritems():
                alias = alias.lower()
                unicode_category_aliases[alias] = category
                # Also register the alias with underscores stripped
                if '_' in alias:
                    unicode_category_aliases[alias.replace('_', '')] = category

        unicode_property_value_aliases[prop] = aliases
def format_regex_char(i):
    """
    Return the character for code point i in regex-ready escaped form.

    The character from wide_unichr(i) is encoded with the
    "unicode-escape" codec and the result is passed through
    replace_regex_chars.
    """
    escaped = wide_unichr(i).encode("unicode-escape")
    return replace_regex_chars(escaped)
def format_regex_char(i):
    '''
    Return the character for code point i in regex-ready escaped form.

    The character from wide_unichr(i) is encoded with the
    'unicode-escape' codec and the result is passed through
    replace_regex_chars.
    '''
    escaped = wide_unichr(i).encode('unicode-escape')
    return replace_regex_chars(escaped)