Example #1
def cleanup_title(value):
    """Normalize a raw title into title case and restore company acronyms.

    The value has one leading and one trailing quote character removed (if
    present), all remaining double quotes dropped and surrounding whitespace
    stripped. It is then lower-cased, HTML-unescaped and title-cased with
    ICU's ``en_US`` title break iterator. Finally every entry of
    ``settings.COMPANY_ACRONYMS`` is applied; each entry is indexed as
    ``entry[0]`` (the replacement text) and ``entry[1]`` (the text to look
    for in the title-cased string).

    Args:
        value: The raw title text.

    Returns:
        The cleaned-up title as a unicode string. An empty string is returned
        when nothing is left after stripping quotes and whitespace.
    """
    quote_chars = ('"', "'")
    # Guard each index access: an empty value, or one that consists only of
    # a quote character, must not raise IndexError.
    if value and value[0] in quote_chars:
        value = value[1:]
    if value and value[-1] in quote_chars:
        value = value[:-1]
    value = value.replace('"', "").strip()
    if not value:
        return u""

    value = HTMLParser.HTMLParser().unescape(value.lower())

    # Need to use ICU rather than .title() because .title() does not handle
    # things like "Wouldn't" properly. It converts it to "Wouldn'T" rather
    # than keeping the T lowercase.
    en_us_locale = icu.Locale('en_US')
    break_iter = icu.BreakIterator.createTitleInstance(en_us_locale)
    title = unicode(
        icu.UnicodeString(value).toTitle(break_iter, en_us_locale))

    word_enders = (" ", ",", ".", ";", ":", '"', "'", "-")
    for acronym in settings.COMPANY_ACRONYMS:
        replacement, needle = acronym[0], acronym[1]
        if '.com' in replacement:
            # .com often comes at the end of a title so we don't want to add
            # the trailing delimiter check; replace every occurrence.
            title = title.replace(needle, replacement)
        else:
            # Only replace whole words: either the needle ends the title ...
            # (endswith avoids the rfind() == -1 false positive that occurs
            # when the needle is exactly one character longer than the title)
            if title.endswith(needle):
                title = title[:len(title) - len(needle)] + replacement
            # ... or it is directly followed by a word-ending character.
            for ender in word_enders:
                title = title.replace(needle + ender, replacement + ender)
    return title
Example #2
def _normalize(bs: bistr, normalizer: icu.Normalizer2) -> bistr:
    """Apply a Unicode normalization form to ``bs``, chunk by chunk.

    The modified text of ``bs`` is walked with ``normalizer``: runs that
    already pass the normalizer's quick check are skipped in the builder,
    and every other run up to the next normalization boundary is replaced
    by its normalized form.

    Args:
        bs: The string to normalize.
        normalizer: The ICU normalizer implementing the desired form.

    Returns:
        The result of ``BistrBuilder(bs).build()`` after all skips and
        replacements have been recorded.
    """
    builder = BistrBuilder(bs)
    us = icu.UnicodeString(bs.modified)

    while not builder.is_complete:
        # Skip over the prefix the normalizer reports as already normalized.
        # UnicodeString indices are UTF-16 code units, the builder is driven
        # with code point counts, hence countChar32().
        i = normalizer.spanQuickCheckYes(us)
        builder.skip(us.countChar32(0, i))
        if builder.is_complete:
            break
        us = us[i:]

        # Advance one code point at a time until the next code point starts
        # a new normalization boundary (or the text ends).
        i = 0
        while i < len(us):
            # A lead surrogate (0xD800-0xDBFF) means the code point occupies
            # two UTF-16 code units.
            if (us.charAt(i) & 0xFC00) == 0xD800:
                i += 1
            i += 1
            if i >= len(us) or normalizer.hasBoundaryBefore(chr(us.char32At(i))):
                break

        chunk = us[:i]
        normalized = str(normalizer.normalize(chunk))
        builder.replace(chunk.countChar32(), normalized)
        us = us[i:]

    return builder.build()
Example #3
    def __init__(self):
        """Create the containment graph and read the IDS data file.

        Creates an empty ``nx.DiGraph`` in ``self._graph`` and iterates over
        the lines of ``babelstone.PATH_TO_IDS_TXT`` (opened as UTF-8),
        handing each line to ``parse`` after a round trip through
        ``icu.UnicodeString``.

        NOTE(review): this listing looks incomplete. The ``if``/``elif``
        below have no bodies and the method ends on a bare ``if``.
        Presumably the missing statements skip comment/unsupported lines and
        add edges for each parsed entry; confirm against the full source.
        """

        # Graph where the nodes are unicode characters and the edges are "contains"
        # such that successors(尔) = [...你...], and predecessors(你) = [亻,尔].
        # So, insert with self._graph.add_edge( "亻", "你" )
        #                 self._graph.add_edge( "尔", "你" )
        self._graph = nx.DiGraph()

        with open(babelstone.PATH_TO_IDS_TXT, encoding="UTF-8") as fp:
            for line in fp:
                # Ignore comments
                # NOTE(review): no statement follows this condition here;
                # presumably a `continue` — confirm.
                if line.startswith("#"):
                # TODO(ambuc): ids.txt uses:
                # {1}, {2}, etc. to represent unencoded components.
                # ↔         as a mirror operator, i.e. to represent a component without
                #           a Unicode encoding, but whose mirror does have a Unicode
                #           encoding.
                # ↷         as a rotation operator, i.e. to represent a component
                #           without a Unicode encoding, but whose 180deg rotation does
                #           have a Unicode encoding.
                # 〾        as a variation indicator. We should try to handle these.
                # ?, ?     ids.txt uses these to represent an unencodable component.
                # We should probably try to handle these edge cases.
                # NOTE(review): lines containing any of the markers above match
                # this pattern; the branch body is missing here — presumably
                # such lines are skipped. Confirm.
                elif re.search("[{}↔↷〾??]", line):

                # Parse the line; `parse` may return None, which is checked
                # for immediately afterwards.
                maybe_parsed_set = parse(str(icu.UnicodeString(line)))
                if maybe_parsed_set is not None:
Example #4
def get_section_title(ch):
    """Return the section title for ``ch``.

    ``ch`` is converted to unicode and NFKD-decomposed, and only the first
    element of the decomposition is considered (so an accented letter is
    reduced to its base letter).

    Args:
        ch: The character (or string) to derive the title from.

    Returns:
        ``''`` when ``ch`` is empty or its first decomposed character is not
        a letter; the character itself when it is a letter that is not
        lower-case; otherwise its upper-case form according to the
        module-level ``lang_locale``.
    """
    # Empty input has no first character; treat it like a non-letter instead
    # of raising IndexError below.
    if ch == '':
        return ''
    nkfd_form = unicodedata.normalize('NFKD', unicode(ch))
    nkfd_ch = nkfd_form[0]
    cat = unicodedata.category(nkfd_ch)
    if cat[0] != 'L':  # Not a letter
        return ''
    if cat[1] != 'l':  # Not a lower-case letter (uppercase or special)
        return nkfd_ch
    # Locale-aware upper-casing via ICU.
    return unicode(icu.UnicodeString(nkfd_ch).toUpper(lang_locale))
Example #5
def _edit(bs: bistr, op: Callable, locale: Optional[str] = None) -> bistr:
    """Apply an ICU edit operation to ``bs`` and record the changes.

    Args:
        bs: The string to transform.
        op: The ICU operation. It is called as ``op(ustring, edits)`` when
            no locale is given and as ``op(icu.Locale(locale), ustring,
            edits)`` otherwise, and must record its changes in ``edits``.
        locale: Optional locale name passed to ``icu.Locale``.

    Returns:
        The result of ``BistrBuilder(bs).build()`` after every span reported
        by the fine-grained edits iterator has been either replaced or
        skipped.
    """
    builder = BistrBuilder(bs)
    edits = icu.Edits()
    ucur = icu.UnicodeString(builder.current)

    # Exactly one of the two call forms must run; running both would apply
    # the operation twice and call icu.Locale(None) when no locale is given.
    if locale is None:
        umod = icu.UnicodeString(op(ucur, edits))
    else:
        umod = icu.UnicodeString(op(icu.Locale(locale), ucur, edits))

    for is_change, old_len, new_len, old_i, new_i, _ in edits.getFineIterator():
        # Edits reports UTF-16 code-unit offsets; the builder is driven with
        # code point counts.
        old_len = ucur.countChar32(old_i, old_len)
        if is_change:
            repl = str(umod[new_i:new_i+new_len])
            builder.replace(old_len, repl)
        else:
            # Unchanged spans still have to be consumed so that the next
            # replacement lands at the right position.
            builder.skip(old_len)

    return builder.build()
Example #6
    def tokenize(self, text: String) -> Tokenization:
        """Split ``text`` into tokens at ICU break-iterator boundaries.

        Args:
            text: The text to tokenize; it is wrapped in ``bistr`` first.

        Returns:
            A ``Tokenization`` of the wrapped text, holding one token per
            boundary-delimited span whose rule status is accepted by
            ``self._check_token``.
        """
        text = bistr(text)
        tokens = []

        bi = self._break_iterator()

        utext = icu.UnicodeString(text.modified)
        # The iterator comes from a factory that never sees the text, so it
        # has to be pointed at the text before asking for boundaries.
        bi.setText(utext)

        # ui/uj are UTF-16 code-unit offsets from ICU; i/j are the matching
        # code point offsets used to slice `text`.
        ui = bi.first()
        uj = bi.nextBoundary()
        i = 0
        while uj != icu.BreakIterator.DONE:
            j = i + utext.countChar32(ui, uj - ui)
            if self._check_token(bi.getRuleStatus()):
                tokens.append(Token.slice(text, i, j))
            ui = uj
            uj = bi.nextBoundary()
            i = j

        return Tokenization(text, tokens)
Example #8
        spec = "".join(file(opts.input).readlines())
        brk = icu.RuleBasedBreakIterator(spec)
        brk = icu.RuleBasedBreakIterator()


    if opts.codes:
        text = "".join(chr(int(x, 16)) for x in args)
    elif opts.file:
        text = "".join(file(args[0]).readlines())
        text = args[0]

    res = []
    last = brk.first()
        while True:
            next = brk.next()
            #            print(next, " ", brk.getRuleStatus())
            last = next

    if opts.hex:
        print(f" {opts.separator} ".join(" ".join(hex(ord(x)) for x in res)
                                         for y in res))
