def search_results(self, originator):
    """Switch to search-results mode and list the matching passwords.

    The query is whatever follows the ``"Keyword: "`` prefix in the search
    input's text. An entry of ``self.passwords`` matches when the
    casefolded query is a substring of the casefolded entry.

    ``originator`` is accepted but not used by this method.
    """
    prefix_length = len("Keyword: ")
    query = self.search_input.text[prefix_length:]
    needle = casefold(query)

    self.mode = 'search_results'
    self.set_header('SEARCH RESULTS FOR "{0}"'.format(query))

    matches = []
    for entry in self.passwords:
        if needle in casefold(entry):
            matches.append(entry)

    # Rebuild the box: a back button first, then one button per match.
    self._clear_box()
    self.box.body.append(
        BackButton('BACK', self.load_dispatch, self.current, self))
    self._make_password_buttons(matches)
def test_unsupported(self):
    """Check that casefold performs full (C+F) case folding.

    Simple (S) or special-case (T) folding would produce different
    results for the code points exercised here.
    """
    expectations = (
        # Some Turkish fun: 0x130 folds to 0x0069 + 0x0307 with full case
        # folding (F), but to 0x0069 ("i") alone with the Turkish special
        # case (T). Full case folding is expected.
        ("\u0130", "i\u0307"),
        # Full vs simple fold: 0x1F9B folds to 0x1F23 + 0x03B9 with full
        # case folding (F), but to 0x1F93 alone with simple folding (S).
        ("\u1F9B", "\u1F23\u03B9"),
    )
    for original, folded in expectations:
        self.assertEqual(casefold(original), folded)
def test_unsupported(self):
    """Check that casefold performs full (C+F) case folding.

    Simple (S) or special-case (T) folding would produce different
    results for the code points exercised here.
    """
    expectations = (
        # Some Turkish fun: 0x130 folds to 0x0069 + 0x0307 with full case
        # folding (F), but to 0x0069 ("i") alone with the Turkish special
        # case (T). Full case folding is expected.
        (u"\u0130", u"i\u0307"),
        # Full vs simple fold: 0x1F9B folds to 0x1F23 + 0x03B9 with full
        # case folding (F), but to 0x1F93 alone with simple folding (S).
        (u"\u1F9B", u"\u1F23\u03B9"),
    )
    for original, folded in expectations:
        self.assertEqual(casefold(original), folded)
def standardize_string(my_string):
    """Return a standardized copy of a string for comparison.

    The input is stripped, decoded as UTF-8, casefolded and stripped
    again; underscores are then replaced by spaces and the result is
    stripped one final time.

    ``my_string`` must provide ``.decode`` (the value is decoded here).
    """
    decoded = my_string.strip().decode('utf-8')
    folded = casefold(decoded).strip()
    spaced = folded.replace('_', ' ')
    return spaced.strip()
def transform_text(parseText, punctuations, casefolding):
    """Optionally casefold a text and strip punctuation from it.

    Args:
        parseText: The text to transform. When ``casefolding`` is true it
            is decoded from UTF-8 first, so it must provide ``.decode``.
        punctuations: When true, drop every character that is not
            alphanumeric, a hyphen or a space.
        casefolding: When true, decode ``parseText`` as UTF-8 and
            casefold it before any punctuation filtering.

    Returns:
        The transformed text. With both flags false the input is
        returned unchanged.
    """
    if casefolding:
        parseText = casefold(parseText.decode('utf-8'))

    if not punctuations:
        # The accumulator used to be filled only inside the punctuation
        # branch, so disabling punctuation removal returned '' and threw
        # the text away. Hand back the (possibly casefolded) text instead.
        return parseText

    # join() avoids the quadratic cost of repeated string concatenation.
    return ''.join(
        letter for letter in parseText
        if letter.isalnum() or letter == '-' or letter == ' '
    )
def parse(self, data):
    """Reduce ``data`` to a casefolded, punctuation-free UTF-8 value.

    Steps, in order: casefold, encode to UTF-8, replace every run of
    non-word characters with a single space, lowercase, and strip
    surrounding whitespace.

    NOTE(review): the ``str`` pattern is applied to the *encoded* value;
    Python 3's ``re`` rejects that combination, so this looks like
    Python 2 code -- confirm the target interpreter.
    """
    encoded = casefold(data).encode('utf-8')
    without_punctuation = re.sub(r'\W+', ' ', encoded)
    return without_punctuation.lower().strip()
def _normalize(cls, value):
    """
    Normalize a search value.

    The value is NFC-normalized, casefolded, and then every match of
    C{cls._searchNoise} is removed from it.

    @type  value: L{unicode}
    @param value: The value to normalize.

    @rtype: L{unicode}
    @return: The normalized value.
    """
    composed = normalize('NFC', value)
    folded = casefold(composed)
    return cls._searchNoise.sub(u'', folded)
def text_preprocess(review):
    """
    Turn a UTF-8 encoded review into a list of cleaned tokens:
        1. Decode and casefold the text
        2. Tokenize it
        3. Drop every token found in ``stop``
        4. Stem the remaining tokens
    Returns the list of stemmed tokens.
    """
    folded = casefold(unicode(review, 'utf-8'))

    kept_tokens = []
    for token in word_tokenize(folded):
        if token not in stop:
            kept_tokens.append(token)

    return [stemmer.stem(token) for token in kept_tokens]
def transform_for_lm(string):
    """Rewrite ``string`` as space-separated, casefolded symbols.

    Each input character becomes one symbol: a space becomes ``}``, a
    period that ends the string or is followed by a space becomes
    ``</s>``, and anything else is lowercased. Symbols are separated by
    single spaces and the joined result is casefolded.
    """
    last_index = len(string) - 1
    pieces = []
    for index, char in enumerate(string):
        if char == ' ':
            pieces.append(u'}')
        elif char == '.' and (index == last_index
                              or string[index + 1] == ' '):
            # Treat the period as an end-of-sentence marker.
            pieces.append(u'</s>')
        else:
            pieces.append(char.lower())
        # Separator after every symbol except the last one.
        if index < last_index:
            pieces.append(u' ')
    return casefold(''.join(pieces))
def _apply(self, value):
    """Casefold ``value`` after checking that it is text.

    Returns ``None`` when the type filter has recorded errors; otherwise
    returns the casefolded value.
    """
    value = self._filter(value, Type(text_type))  # type: Text

    if self._has_errors:
        return None

    # https://docs.python.org/3/library/stdtypes.html#str.casefold
    if not PY3:
        # Python 2 has no native case folding; py2casefold is the best
        # available substitute.
        # noinspection PyUnresolvedReferences
        from py2casefold import casefold
        return casefold(value)

    # Python 3 supports case folding natively.
    # noinspection PyUnresolvedReferences
    return value.casefold()
def create_vocab_for_lm(pickle_file_1, pickle_file_2=None):
    """Write every character used in the pickled error contexts to a file.

    Each pickle is expected to hold an object whose ``'Error Context'``
    column supports ``.str.replace`` (presumably a pandas DataFrame --
    verify against the producer). Spaces in the contexts are converted
    to ``}`` before the characters are collected. The distinct characters
    are written one per line, in no particular order, to
    ``all_chars.txt`` in the current working directory.

    Args:
        pickle_file_1: Path to the first pickle; its text is used as-is.
        pickle_file_2: Optional path to a second pickle; its text is
            casefolded before its characters are collected.

    NOTE(review): ``pickle.load`` can execute arbitrary code from the
    file -- only pass pickles from a trusted source.
    """
    outputfile_name = "all_chars.txt"

    # Context managers close the pickle files promptly; the previous
    # ``pickle.load(open(...))`` form left the handles open.
    with open(pickle_file_1, "rb") as handle:
        df_1 = pickle.load(handle)
    context_1 = df_1['Error Context'].str.replace(" ", "}")  # spaces -> }
    all_chars = set(' '.join(context_1.values))

    if pickle_file_2:
        with open(pickle_file_2, "rb") as handle:
            df_2 = pickle.load(handle)
        context_2 = df_2['Error Context'].str.replace(" ", "}")  # spaces -> }
        # Only the second file's text is casefolded.
        all_chars |= set(casefold(' '.join(context_2.values)))

    with codecs.open(outputfile_name, 'w', 'utf-8') as outfile:
        for char in all_chars:
            outfile.write(char + '\n')
def casefold_file(file_name):
    """Write a casefolded copy of a UTF-8 text file.

    The copy is written line by line to ``file_name`` with
    ``_casefolded`` appended to it.
    """
    target_name = file_name + '_casefolded'
    with codecs.open(file_name, 'r', 'utf-8') as source, \
            codecs.open(target_name, 'w', 'utf-8') as target:
        for text_line in source:
            target.write(casefold(text_line))
def test_basic(self):
    """Spot-check casefold on a few non-ASCII inputs."""
    # A sharp s folds to a double s.
    german = casefold(u"tschüß")
    self.assertEqual(german, u"tschüss")

    # Two spellings that differ only in case fold to the same value.
    first_spelling = casefold(u"ΣίσυφοςfiÆ")
    second_spelling = casefold(u"ΣΊΣΥΦΟσFIæ")
    self.assertEqual(first_spelling, second_spelling)

    self.assertEqual(casefold(u"ffi"), u"ffi")
def test_ascii_only(self):
    """ASCII input is folded to its lowercase form."""
    cases = (
        (u"aA", u"aa"),
        (u"fOo Bar", u"foo bar"),
    )
    for original, folded in cases:
        self.assertEqual(casefold(original), folded)
def test_empty_string(self):
    """The empty string casefolds to the empty string."""
    folded = casefold(u"")
    self.assertEqual(folded, u"")
def test_empty_string(self):
    """The empty string casefolds to the empty string."""
    folded = casefold("")
    self.assertEqual(folded, "")
def normalize_caseless(text):
    """Casefold ``text`` and return the NFKD normalization of the result."""
    folded = casefold(text)
    return unicodedata.normalize("NFKD", folded)
def test_ascii_only(self):
    """ASCII input is folded to its lowercase form."""
    cases = (
        ("aA", "aa"),
        ("fOo Bar", "foo bar"),
    )
    for original, folded in cases:
        self.assertEqual(casefold(original), folded)
def normalize_caseless(text):
    """Casefold ``text`` and return the NFC normalization of the result."""
    folded = casefold(text)
    return unicodedata.normalize("NFC", folded)
def test_basic(self):
    """Spot-check casefold on a few non-ASCII inputs."""
    # A sharp s folds to a double s.
    german = casefold("tschüß")
    self.assertEqual(german, "tschüss")

    # Two spellings that differ only in case fold to the same value.
    first_spelling = casefold("ΣίσυφοςfiÆ")
    second_spelling = casefold("ΣΊΣΥΦΟσFIæ")
    self.assertEqual(first_spelling, second_spelling)

    self.assertEqual(casefold("ffi"), "ffi")
def caseFolding(data):
    """Casefold every element of ``data`` in place.

    Each position of ``data`` is overwritten with the casefolded form of
    the element stored there, and the same (mutated) object is returned.
    """
    for position, item in enumerate(data):
        data[position] = casefold(item)
    return data
def username_eq(self, u1, u2):
    """
    Casefold both usernames and report whether they are equal.

    Can be overridden if the service has case sensitive usernames.
    """
    first = casefold(u1)
    second = casefold(u2)
    return first == second