def cleanup_title(value):
    """Title-case ``value`` with ICU and restore company acronym casing.

    Steps, in order:
      1. Drop one leading and one trailing quote character (``"`` or ``'``).
      2. Remove any remaining double quotes and surrounding whitespace.
      3. Lower-case the text and unescape HTML entities.
      4. Title-case the result with ICU's en_US title break iterator.
      5. Apply every entry of ``settings.COMPANY_ACRONYMS``: ``acronym[1]``
         is the text searched for in the title and ``acronym[0]`` is what
         replaces it.

    :param value: the raw title text; an empty string is accepted.
    :returns: the cleaned-up title as a ``unicode`` object.
    """
    # Need to use this rather than .title() because .title()
    # does not handle things like "Wouldn't" properly. It
    # converts it to "Wouldn'T" rather than keeping the T
    # lowercase
    quote_chars = ('"', "'")
    # Slices (not indexes) so that an empty value, or a value consisting of
    # a single quote character, does not raise IndexError.
    if value[:1] in quote_chars:
        value = value[1:]
    if value[-1:] in quote_chars:
        value = value[:-1]
    value = value.replace('"', "").strip()
    value = HTMLParser.HTMLParser().unescape(value.lower())

    en_us_locale = icu.Locale('en_US')
    break_iter = icu.BreakIterator.createTitleInstance(en_us_locale)
    temp_title = icu.UnicodeString(value)
    title = unicode(temp_title.toTitle(break_iter, en_us_locale))

    word_enders = [" ", ",", ".", ";", ":", '"', "'", "-"]
    for acronym in settings.COMPANY_ACRONYMS:
        if '.com' in acronym[0]:
            # .com often comes at the end of a title so we don't want to add
            # the trailing space check
            title = title.replace(acronym[1], acronym[0])
        else:
            # endswith() instead of comparing rfind() with a computed offset:
            # rfind() returns -1 when the text is absent, which equals the
            # computed offset whenever the acronym is exactly one character
            # longer than the title and used to trigger a bogus replacement.
            if title.endswith(acronym[1]):
                title = "%s%s" % (title[:len(title) - len(acronym[1])],
                                  acronym[0])
            # Elsewhere in the title, only replace the acronym when it is
            # directly followed by a character that ends a word.
            for ender in word_enders:
                needle = "%s%s" % (acronym[1], ender)
                if needle in title:
                    title = title.replace(needle,
                                          "%s%s" % (acronym[0], ender))
    return title
def _normalize(bs: bistr, normalizer: icu.Normalizer2) -> bistr:
    """Normalize the modified text of ``bs`` with an ICU normalizer.

    The text is walked as a UTF-16 ``icu.UnicodeString``. Stretches that pass
    the normalizer's quick check are skipped unchanged; each remaining chunk
    (up to the next position the normalizer reports a boundary before) is
    normalized and recorded as a replacement on the builder.

    :param bs: the string to normalize.
    :param normalizer: the ICU ``Normalizer2`` instance to apply.
    :returns: the string produced by the builder once all text is consumed.
    """
    builder = BistrBuilder(bs)
    us = icu.UnicodeString(bs.modified)

    while not builder.is_complete:
        # Keep the leading span that already passes the quick check.
        # spanQuickCheckYes() counts UTF-16 code units, while the builder is
        # advanced by the code-point count obtained from countChar32().
        i = normalizer.spanQuickCheckYes(us)
        builder.skip(us.countChar32(0, i))
        if builder.is_complete:
            break
        us = us[i:]

        # Extend the chunk one code point at a time until the normalizer
        # reports a boundary before the next code point.
        i = 0
        while i < len(us):
            # A code unit in 0xD800-0xDBFF is a lead surrogate: step over
            # both halves of the pair.
            if us.charAt(i) & 0xFC00 == 0xD800:
                i += 1
            i += 1
            if normalizer.hasBoundaryBefore(chr(us.char32At(i))):
                break

        chunk = us[:i]
        normalized = str(normalizer.normalize(chunk))
        builder.replace(chunk.countChar32(), normalized)
        us = us[i:]

    # Hand back the built string; a bare ``return`` here yielded None.
    return builder.build()
def __init__(self):
    """Build the component graph from the BabelStone IDS file.

    Reads ``babelstone.PATH_TO_IDS_TXT`` line by line, skips comment lines
    and lines that use notation which is not handled yet, parses every
    remaining line with ``parse`` and inserts each successful result via
    ``self.insert``.
    """
    # Local import so this method is self-contained for the filter below.
    import re

    # Graph where the nodes are unicode characters and the edges are "contains"
    # such that successors(尔) = [...你...], and predecessors(你) = [亻,尔].
    # So, insert with self._graph.add_edge( "亻", "你" )
    #                 self._graph.add_edge( "尔", "你" )
    self._graph = nx.DiGraph()

    # TODO(ambuc): ids.txt uses:
    #   {1}, {2}, etc. to represent unencoded components.
    #   ↔ as a mirror operator, i.e. to represent a component without
    #     a Unicode encoding, but whose mirror does have a Unicode
    #     encoding.
    #   ↷ as a rotation operator, i.e. to represent a component
    #     without a Unicode encoding, but whose 180deg rotation does
    #     have a Unicode encoding.
    #   〾 as a variation indicator. We should try to handle these.
    #   ?, ? ids.txt uses these to represent an unencodable component.
    # We should probably try to handle these edge cases.
    # NOTE(review): the call that applied this pattern was missing from the
    # reviewed source; a search anywhere in the line is assumed - confirm.
    unsupported = re.compile("[{}↔↷〾??]")

    with open(babelstone.PATH_TO_IDS_TXT, encoding="UTF-8") as fp:
        for line in fp:
            # Ignore comments
            if line.startswith("#"):
                continue
            # Ignore lines that use any of the unsupported notations.
            if unsupported.search(line):
                continue
            maybe_parsed_set = parse(str(icu.UnicodeString(line)))
            if maybe_parsed_set is not None:
                self.insert(maybe_parsed_set)
def get_section_title(ch):
    """Return the section heading character for ``ch``.

    ``ch`` is NFKD-normalized and only the first character of the result is
    considered. The return value is:

      * ``''`` when that character is not a letter,
      * the character itself when it is a letter other than a lower-case one,
      * its ICU upper-casing under ``lang_locale`` when it is lower-case.
    """
    base_char = unicodedata.normalize('NFKD', unicode(ch))[0]
    # A general category is two characters: major class, then subclass.
    major, minor = unicodedata.category(base_char)
    if major != 'L':
        # Not a letter
        return ''
    if minor == 'l':
        return unicode(icu.UnicodeString(base_char).toUpper(lang_locale))
    # Not a lower-case letter (uppercase or special)
    return base_char
def _edit(bs: bistr, op: Callable, locale: Optional[str] = None) -> bistr:
    """Apply an ICU edit operation to ``bs`` and record it on a builder.

    ``op`` is called with the current text as an ``icu.UnicodeString`` and an
    ``icu.Edits`` object that it fills in; when ``locale`` is given, an
    ``icu.Locale`` built from it is passed as the first argument. The
    recorded edits are then replayed onto the builder.

    :param bs: the string to edit.
    :param op: callable invoked as ``op(text, edits)`` or
        ``op(locale, text, edits)``.
    :param locale: optional locale name handed to ``icu.Locale``.
    :returns: the string produced by the builder after replaying all edits.
    """
    builder = BistrBuilder(bs)
    edits = icu.Edits()
    ucur = icu.UnicodeString(builder.current)

    if locale is None:
        umod = icu.UnicodeString(op(ucur, edits))
    else:
        umod = icu.UnicodeString(op(icu.Locale(locale), ucur, edits))

    for is_change, old_len, new_len, old_i, new_i, _ in edits.getFineIterator():
        # Offsets and lengths index the UnicodeString objects; the builder
        # is advanced by the code-point count obtained from countChar32().
        old_len = ucur.countChar32(old_i, old_len)
        if is_change:
            repl = str(umod[new_i:new_i + new_len])
            builder.replace(old_len, repl)
        else:
            builder.skip(old_len)

    # Hand back the built string; a bare ``return`` here yielded None.
    return builder.build()
def tokenize(self, text: String) -> Tokenization:
    """Split ``text`` into tokens at the boundaries of an ICU break iterator.

    The break iterator from ``self._break_iterator()`` walks the modified
    text. Each segment between two consecutive boundaries becomes a token
    when ``self._check_token`` accepts the iterator's rule status for it.

    :param text: the text to tokenize; it is wrapped in ``bistr`` first.
    :returns: a ``Tokenization`` of the wrapped text and the kept tokens.
    """
    text = bistr(text)
    tokens = []

    breaker = self._break_iterator()
    utext = icu.UnicodeString(text.modified)
    breaker.setText(utext)

    # Two cursors advance together: ``unit_start`` is the break iterator's
    # offset into ``utext``, ``char_start`` the matching code-point offset
    # used to slice ``text``.
    unit_start = breaker.first()
    char_start = 0
    for unit_end in iter(breaker.nextBoundary, icu.BreakIterator.DONE):
        char_end = char_start + utext.countChar32(unit_start,
                                                  unit_end - unit_start)
        if self._check_token(breaker.getRuleStatus()):
            tokens.append(Token.slice(text, char_start, char_end))
        unit_start, char_start = unit_end, char_end

    return Tokenization(text, tokens)
# Matches a run of one or more double-quote characters.
regex_doublequotes = re.compile(r'\"+');

""" To detect Multiword phrases inside a sentence we are using Gensim Library and it's models.phrases class We have already made an object of the class and put all the tokenized reviews in it and stored it in gensim_mulitphrases class (see Now we are loading object back from pickle file """
# The file name is built from the module-level name Business, which must be
# defined before this statement runs.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file when it comes from a trusted source.
with open("gensim_mulitphrases_"+Business+".txt", "rb") as fp:
    bigram = pickle.load(fp)

""" The py2casefold in not tested enough, slow, and I couldn't install it in anaconda. This lambda function is a replacement. The function returns a string, not a unicode, because we will use the .translate fast function to remove punct. of strings. Source: """
# Case-folds u through ICU, then encodes the result with CharsSet using the
# "replace" error handler. CharsSet is looked up when the lambda is called,
# so assigning it on the following line is fine.
casefold = lambda u: unicode(icu.UnicodeString(u).foldCase()).encode(CharsSet,"replace");
CharsSet = "ascii"; # The Character set to be used as the default one when interpreting texts


def getWordnetPos(_treebank_tag):
    """
    Translate a Treebank PoS tag to the matching WordNet PoS tag.

    > Parameters:
        _treebank_tag : str | The tag to be translated

    > Returns:
        wordnet.ADJ when the tag starts with 'J'. No other branch is present
        in this block, so any other tag falls through and returns None.
    """
    if _treebank_tag.startswith('J'):
        return wordnet.ADJ
spec = "".join(file(opts.input).readlines()) brk = icu.RuleBasedBreakIterator(spec) else: brk = icu.RuleBasedBreakIterator() print(brk.getRules()) if text = "".join(chr(int(x, 16)) for x in args) elif opts.file: text = "".join(file(args[0]).readlines()) else: text = args[0] res = [] brk.setText(icu.UnicodeString(text)) last = brk.first() try: while True: next = # print(next, " ", brk.getRuleStatus()) res.append(text[last:next]) last = next except: res.append(text[last:]) if opts.hex: print(f" {opts.separator} ".join(" ".join(hex(ord(x)) for x in res) for y in res)) else: print(opts.separator.join(res))
import random # The path to the main output folder, in which the text files are placed, and checks are made to avoid duplicated work: directory = "./RestaurantsToOthers_Data" regex_slash_newlines = re.compile(r'[\n\r\\]+') regex_slash_tabs = re.compile(r'\t+') regex_doublequotes = re.compile(r'\"+') """ The py2casefold in not tested enough, slow, and I couldn't install it in anaconda. This lambda function is a replacement. The function returns a string, not a unicode, because we will use the .translate fast function to remove punct. of strings. Source: """ CharsSet = "ascii" # The Character set to be used as the default one when interpreting texts casefold = lambda u: unicode(icu.UnicodeString(u).foldCase()).encode( CharsSet, "ignore") def iter_sample_fast(iterable, samplesize): """ Fast memory-efficient sampling method for pretty large iterables. Adopted from: > Parameters: * iterable: iterable object | The collection we want to sample from * samplesize: int | How many samples to generate > Returns: List of samples, hoewver, since sampling is made without replacement (most probably), they aren't IID; but