import HTMLParser  # Python 2 stdlib; use html.unescape on Python 3

import icu


def cleanup_title(value):
    # Need to use this rather than .title() because .title() does not
    # handle things like "Wouldn't" properly. It converts it to
    # "Wouldn'T" rather than keeping the T lowercase.
    if value[0] == '"' or value[0] == "'":
        value = value[1:]
    if value[-1] == '"' or value[-1] == "'":
        value = value[:-1]
    value = value.replace('"', "").strip()
    value = HTMLParser.HTMLParser().unescape(value.lower())

    en_us_locale = icu.Locale('en_US')
    break_iter = icu.BreakIterator.createTitleInstance(en_us_locale)
    temp_title = icu.UnicodeString(value)
    title = unicode(temp_title.toTitle(break_iter, en_us_locale))

    # settings.COMPANY_ACRONYMS is project configuration: pairs of
    # (preferred spelling, title-cased spelling) to restore after toTitle().
    word_enders = [" ", ",", ".", ";", ":", '"', "'", "-"]
    for acronym in settings.COMPANY_ACRONYMS:
        if '.com' in acronym[0]:
            # .com often comes at the end of a title so we don't want to add
            # the trailing space check
            if acronym[1] in title:
                title = title.replace(acronym[1], acronym[0])
        else:
            if title.rfind(acronym[1]) == len(title) - len(acronym[1]):
                title = "%s%s" % (title[:len(title) - len(acronym[1])],
                                  acronym[0])
            for ender in word_enders:
                if "%s%s" % (acronym[1], ender) in title:
                    title = title.replace("%s%s" % (acronym[1], ender),
                                          "%s%s" % (acronym[0], ender))
    return title
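# A minimal, self-contained sketch of the ICU title-casing used above,
# contrasting it with str.title(); the sample string is illustrative.
import icu

loc = icu.Locale('en_US')
brk = icu.BreakIterator.createTitleInstance(loc)
us = icu.UnicodeString("wouldn't it be nice")
print(str(us.toTitle(brk, loc)))      # Wouldn't It Be Nice
print("wouldn't it be nice".title())  # Wouldn'T It Be Nice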
# bistring-internal helper: bistr and BistrBuilder come from the bistring
# library and keep the original and normalized strings aligned.
def _normalize(bs: bistr, normalizer: icu.Normalizer2) -> bistr:
    builder = BistrBuilder(bs)
    us = icu.UnicodeString(bs.modified)

    while not builder.is_complete:
        # Skip the longest prefix that is already in normalized form.
        i = normalizer.spanQuickCheckYes(us)
        builder.skip(us.countChar32(0, i))
        if builder.is_complete:
            break
        us = us[i:]

        # Find the next normalization boundary, stepping over surrogate
        # pairs (lead surrogates are 0xD800-0xDBFF in UTF-16).
        i = 0
        while i < len(us):
            if us.charAt(i) & 0xFC00 == 0xD800:
                i += 1
            i += 1
            if i < len(us) and normalizer.hasBoundaryBefore(chr(us.char32At(i))):
                break

        # Normalize the chunk up to that boundary and splice it in.
        chunk = us[:i]
        normalized = str(normalizer.normalize(chunk))
        builder.replace(chunk.countChar32(), normalized)
        us = us[i:]

    return builder.build()
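# A minimal sketch of the Normalizer2 calls driving _normalize() above,
# shown on a plain string; the example text is illustrative.
import icu

nfc = icu.Normalizer2.getNFCInstance()
s = icu.UnicodeString("cafe\u0301")  # 'e' + combining acute accent
print(str(nfc.normalize(s)))         # 'café', precomposed (NFC)
# spanQuickCheckYes() returns the length of the leading prefix that is
# already known to be normalized; that is how _normalize() skips work:
print(nfc.spanQuickCheckYes(s))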
def __init__(self):
    # Graph where the nodes are unicode characters and the edges are
    # "contains", such that successors(尔) = [...你...] and
    # predecessors(你) = [亻, 尔]. So, insert with:
    #   self._graph.add_edge("亻", "你")
    #   self._graph.add_edge("尔", "你")
    self._graph = nx.DiGraph()

    with open(babelstone.PATH_TO_IDS_TXT, encoding="UTF-8") as fp:
        for line in fp:
            # Ignore comments
            if line.startswith("#"):
                continue
            # TODO(ambuc): ids.txt uses:
            #   {1}, {2}, etc. to represent unencoded components.
            #   ↔ as a mirror operator, i.e. to represent a component
            #     without a Unicode encoding, but whose mirror does have
            #     a Unicode encoding.
            #   ↷ as a rotation operator, i.e. to represent a component
            #     without a Unicode encoding, but whose 180deg rotation
            #     does have a Unicode encoding.
            #   〾 as a variation indicator.
            #   ？ and ? to represent an unencodable component.
            # We should probably try to handle these edge cases.
            elif re.search("[{}↔↷〾？?]", line):
                continue

            maybe_parsed_set = parse(str(icu.UnicodeString(line)))
            if maybe_parsed_set is not None:
                self.insert(maybe_parsed_set)
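# A toy version of the "contains" graph described in the comment above,
# using the same networkx API; the two edges mirror the docstring example.
import networkx as nx

graph = nx.DiGraph()
graph.add_edge("亻", "你")  # 你 contains the radical 亻
graph.add_edge("尔", "你")  # 你 contains the component 尔
print(list(graph.successors("尔")))    # ['你']
print(list(graph.predecessors("你")))  # ['亻', '尔']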
def get_section_title(ch):
    nkfd_form = unicodedata.normalize('NFKD', unicode(ch))
    nkfd_ch = nkfd_form[0]
    cat = unicodedata.category(nkfd_ch)
    if 'L' != cat[0]:
        # Not a letter
        return ''
    if 'l' != cat[1]:
        # Not a lower-case letter (uppercase or special)
        return nkfd_ch
    return unicode(icu.UnicodeString(nkfd_ch).toUpper(lang_locale))
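# A quick sketch of the NFKD/category/toUpper pipeline above, in Python 3
# (str replaces unicode). An en_US locale stands in for the module-level
# lang_locale, which is an assumption; the sample characters are illustrative.
import unicodedata
import icu

loc = icu.Locale('en_US')
for ch in ['é', 'A', '²']:
    base = unicodedata.normalize('NFKD', ch)[0]
    cat = unicodedata.category(base)
    if cat[0] != 'L':
        print(ch, '-> (not a letter)')        # '²' decomposes to digit '2'
    elif cat[1] != 'l':
        print(ch, '->', base)                 # already uppercase/special
    else:
        print(ch, '->', str(icu.UnicodeString(base).toUpper(loc)))  # 'é' -> 'E'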
# bistring-internal helper: applies an ICU case-mapping operation `op`
# (e.g. icu.CaseMap.toLower) while icu.Edits records what changed.
def _edit(bs: bistr, op: Callable, locale: Optional[str] = None) -> bistr:
    builder = BistrBuilder(bs)
    edits = icu.Edits()
    ucur = icu.UnicodeString(builder.current)

    if locale is None:
        umod = icu.UnicodeString(op(ucur, edits))
    else:
        umod = icu.UnicodeString(op(icu.Locale(locale), ucur, edits))

    for is_change, old_len, new_len, old_i, new_i, _ in edits.getFineIterator():
        # old_len is in UTF-16 code units; convert it to code points.
        old_len = ucur.countChar32(old_i, old_len)
        if is_change:
            repl = str(umod[new_i:new_i + new_len])
            builder.replace(old_len, repl)
        else:
            builder.skip(old_len)

    return builder.build()
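# A minimal sketch of how an icu.Edits object records case-mapping changes.
# icu.CaseMap.toLower is used as a plausible `op` for _edit() above; the
# (Locale, UnicodeString, Edits) call shape is an assumption inferred from
# that function, and requires a reasonably recent PyICU.
import icu

edits = icu.Edits()
src = icu.UnicodeString("Hello WORLD")
dst = icu.UnicodeString(icu.CaseMap.toLower(icu.Locale('en_US'), src, edits))
for is_change, old_len, new_len, old_i, new_i, _ in edits.getFineIterator():
    old = str(src[old_i:old_i + old_len])
    new = str(dst[new_i:new_i + new_len])
    print('changed' if is_change else 'kept   ', repr(old), '->', repr(new))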
# bistring-style tokenizer: String, Tokenization, and Token come from the
# bistring library; icu supplies the break iterator.
def tokenize(self, text: String) -> Tokenization:
    text = bistr(text)
    tokens = []

    bi = self._break_iterator()
    utext = icu.UnicodeString(text.modified)
    bi.setText(utext)

    ui = bi.first()
    uj = bi.nextBoundary()

    # ICU boundaries (ui, uj) are UTF-16 code unit offsets; i and j track
    # the corresponding code point offsets into the Python string.
    i = 0
    while uj != icu.BreakIterator.DONE:
        j = i + utext.countChar32(ui, uj - ui)
        if self._check_token(bi.getRuleStatus()):
            tokens.append(Token.slice(text, i, j))
        ui = uj
        uj = bi.nextBoundary()
        i = j

    return Tokenization(text, tokens)
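# A standalone sketch of the boundary loop above, with a stock word
# BreakIterator in place of self._break_iterator(). The sample text is
# ASCII so UTF-16 offsets and code point offsets coincide; rule statuses
# below 100 (the UBRK_WORD_NONE range) are spaces and punctuation.
import icu

bi = icu.BreakIterator.createWordInstance(icu.Locale('en_US'))
text = "The quick, brown fox."
bi.setText(icu.UnicodeString(text))
start = bi.first()
end = bi.nextBoundary()
while end != icu.BreakIterator.DONE:
    if bi.getRuleStatus() >= 100:  # keep letter/number tokens only
        print(text[start:end])
    start = end
    end = bi.nextBoundary()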
import pickle
import re

import icu
from nltk.corpus import wordnet

regex_doublequotes = re.compile(r'\"+')

"""
To detect multiword phrases inside a sentence we use the Gensim library and
its models.phrases class:
https://radimrehurek.com/gensim/models/phrases.html
We have already built an object of that class, fed it all the tokenized
reviews, and pickled it (see generate_pickle_for_Gensim.py). Now we load the
object back from the pickle file. `Business` is a module-level name
identifying the current dataset.
"""
with open("gensim_mulitphrases_" + Business + ".txt", "rb") as fp:
    bigram = pickle.load(fp)

"""
py2casefold is not tested enough, is slow, and couldn't be installed under
Anaconda; this lambda is a replacement. It returns a string, not a unicode,
because we will use the fast str.translate to strip punctuation.
Source: https://stackoverflow.com/a/32838944/3429115
"""
CharsSet = "ascii"  # The character set used by default when interpreting texts
casefold = lambda u: unicode(icu.UnicodeString(u).foldCase()).encode(CharsSet, "replace")


def getWordnetPos(_treebank_tag):
    """
    Translate Treebank PoS tags to WordNet's.

    > Parameters:
        _treebank_tag : str | The tag to be translated

    > Returns:
        The relevant WordNet PoS tag

    https://stackoverflow.com/a/15590384/3429115
    """
    if _treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif _treebank_tag.startswith('V'):
        return wordnet.VERB
    elif _treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif _treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''
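# A tiny sketch of the ICU case folding the lambda above relies on, in
# Python 3 (str instead of unicode). foldCase() applies full Unicode case
# folding, e.g. ß -> ss, before the lossy ASCII encode.
import icu

folded = str(icu.UnicodeString("Straße").foldCase())
print(folded)                             # strasse
print(folded.encode("ascii", "replace"))  # b'strasse'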
spec = "".join(file(opts.input).readlines()) brk = icu.RuleBasedBreakIterator(spec) else: brk = icu.RuleBasedBreakIterator() print(brk.getRules()) if opts.codes: text = "".join(chr(int(x, 16)) for x in args) elif opts.file: text = "".join(file(args[0]).readlines()) else: text = args[0] res = [] brk.setText(icu.UnicodeString(text)) last = brk.first() try: while True: next = brk.next() # print(next, " ", brk.getRuleStatus()) res.append(text[last:next]) last = next except: res.append(text[last:]) if opts.hex: print(f" {opts.separator} ".join(" ".join(hex(ord(x)) for x in res) for y in res)) else: print(opts.separator.join(res))
import random
import re

import icu

# The path to the main output folder, in which the text files are placed
# and where checks are made to avoid duplicating work:
directory = "./RestaurantsToOthers_Data"

regex_slash_newlines = re.compile(r'[\n\r\\]+')
regex_slash_tabs = re.compile(r'\t+')
regex_doublequotes = re.compile(r'\"+')

"""
py2casefold is not tested enough, is slow, and couldn't be installed under
Anaconda; this lambda is a replacement. It returns a string, not a unicode,
because we will use the fast str.translate to strip punctuation.
Source: https://stackoverflow.com/a/32838944/3429115
"""
CharsSet = "ascii"  # The character set used by default when interpreting texts
casefold = lambda u: unicode(icu.UnicodeString(u).foldCase()).encode(
    CharsSet, "ignore")


def iter_sample_fast(iterable, samplesize):
    """
    Fast, memory-efficient sampling method for fairly large iterables.
    Adapted from: https://stackoverflow.com/a/12583436/3429115

    > Parameters:
        * iterable: iterable object | The collection we want to sample from
        * samplesize: int | How many samples to generate

    > Returns:
        List of samples; however, since sampling is made without replacement
        (most probably), they aren't IID; but
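# The body of iter_sample_fast is cut off above. The sketch below is a
# hypothetical reconstruction of the reservoir-style approach in the linked
# answer (https://stackoverflow.com/a/12583436/3429115), not the original
# code: buffer the first samplesize items, then replace survivors with
# decreasing probability so every item is equally likely to be kept.
import random
from itertools import islice

def reservoir_sample(iterable, samplesize):
    iterator = iter(iterable)
    results = list(islice(iterator, samplesize))
    if len(results) < samplesize:
        raise ValueError("Sample larger than population.")
    random.shuffle(results)
    for i, item in enumerate(iterator, samplesize + 1):
        j = random.randrange(i)  # 0 <= j < i
        if j < samplesize:
            results[j] = item    # item number i survives with probability k/i
    return results

print(reservoir_sample(range(1000), 5))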