import re

import translitcodec


def slugify(*args, **kwargs):
    """Join a series of strings into a URL slug.

    - normalizes strings to proper ASCII representations
    - removes non-alphanumeric characters
    - replaces whitespace with dashes

    :param lower: Whether the slug should be all-lowercase
    :param maxlen: Maximum slug length
    :param fallback: Fallback in case of an empty slug
    """
    lower = kwargs.get('lower', True)
    maxlen = kwargs.get('maxlen')
    fallback = kwargs.get('fallback', '')
    value = '-'.join(str(val) for val in args)
    # Transliterate to ASCII, then strip anything that is not a word
    # character, whitespace, or dash.
    value = translitcodec.long_encode(value)[0]
    value = re.sub(r'[^\w\s-]', '', value, flags=re.ASCII).strip()
    if lower:
        value = value.lower()
    value = re.sub(r'[-\s]+', '-', value)
    if maxlen:
        value = value[0:maxlen].rstrip('-')
    return value or fallback
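# Usage sketch for the slugify() above (the expected outputs assume
# translitcodec's long table, e.g. 'ø' -> 'o' and 'é' -> 'e'):
print(slugify('Hello, Wørld!'))              # -> 'hello-world'
print(slugify('Crème', 'Brûlée', maxlen=5))  # -> 'creme'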
import re

import translitcodec

# Assumed definition: the snippet uses _punct_re without defining it; this
# pattern matches the widely circulated Flask slug snippet.
_punct_re = re.compile(r'[\t !"#$%&\'()*\-/<=>?@\[\\\]^_`{|},.]+')


def slugify(text, delim='-'):
    """Generates an ASCII-only slug."""
    result = []
    for word in _punct_re.split(text.lower()):
        word = translitcodec.long_encode(word)[0]
        if word:
            result.append(word)
    return delim.join(result)
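# Usage sketch (assumes translitcodec's umlaut mappings, 'ö' -> 'o' and
# 'ü' -> 'u'):
print(slugify('My custöm slüg!'))  # -> 'my-custom-slug'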
from string import punctuation

import Stemmer as st  # assumed alias: the method calls st.Stemmer()
import translitcodec


def stem_document(self, doc, re_fit):
    """
    Stem the documents, and prepare the stemmer for the inverse_transform
    by keeping track of a mapping back to the original expressions and
    their counts.

    :param doc: document string
    :param re_fit: boolean, if True, it will prepare the stemmer for the
        inverse_transform by saving state.
    :return: stemmed document string
    """
    # Ignore punctuation and split on spaces.
    for punctuation_character in punctuation:
        doc = doc.replace(punctuation_character, " ")
    doc = doc.replace("  ", " ").replace("  ", " ").strip()

    # Stemmed words won't have accents nor capital letters anymore.
    transformed_words = [
        translitcodec.long_encode(w)[0].lower() for w in doc.split(" ")
    ]
    words = doc.split(" ")
    stemmer = st.Stemmer(self.language)
    stemmed_words = stemmer.stemWords(transformed_words)

    if re_fit:
        # Keep track of things for inverse stemming: each word has its
        # count. The inverse relationship is not deterministic: we count
        # occurrences because we need the TOP equivalent word back.
        for (_word, _stemmed_word) in zip(words, stemmed_words):
            equiv_counts = self.stemmed_word_to_equiv_word_count.setdefault(
                _stemmed_word, {})
            equiv_counts[_word] = equiv_counts.get(_word, 0) + 1

    return " ".join(stemmed_words)
import string

import translitcodec


def remove_from_string(self, text):
    """
    Remove stopwords from a string in the safest possible way to keep
    the text intact.
    """
    # In the following variables, the text's characters flow from bottom
    # to top, as in: text --> last_word|last_punct --> past_text
    past_text = ""
    last_punct = ""
    last_word = ""
    # Append a punctuation mark so the loop runs one last time and
    # closes the final word.
    text += "."
    for char in text:
        decoded_char = translitcodec.short_encode(char)[0].lower()
        # Lowercase alphabet check:
        char_is_letter = decoded_char in string.ascii_lowercase

        if char_is_letter:
            # We're building a word: keep looping.
            last_word += char
        elif last_word == "":
            # Punctuation before any word: ignore N punctuation marks in
            # a row and move on.
            last_punct += char
        else:
            # Punctuation directly after a word: the word is complete,
            # so process it now.
            full_word = last_word
            safe_full_word = translitcodec.long_encode(full_word)[0].lower()
            if safe_full_word in self.safe_stopwords:
                # We remove the word (and the following apostrophe or
                # space if there is one)!
                full_word = ""
                if char in "’'‘’'' ":
                    char = ""
            past_text += last_punct + full_word
            last_punct = char
            last_word = ""
    past_text += last_punct
    # Drop the punctuation mark appended above.
    return past_text[:-1]
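# A minimal demo of remove_from_string() on a stub object (an assumption:
# safe_stopwords holds transliterated, lowercased stopwords like "the"):
class _Demo:
    safe_stopwords = ["the"]

print(remove_from_string(_Demo(), "The cat sat on the mat."))
# -> 'cat sat on mat.'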
import os

import translitcodec


def fit(self, X=None, y=None):
    """
    This method is implemented so the class can be used in scikit-learn's
    Pipeline(). X and y are ignored here, but required by convention.
    It reads the stopwords from disk if none are provided.
    """
    if self.stopwords is None:
        current_dir = os.path.dirname(os.path.realpath(__file__))
        # STOPWORDS_FILENAME is a module-level constant.
        stop_words_file = os.path.join(
            current_dir, "..", "data", STOPWORDS_FILENAME)
        with open(stop_words_file) as f:
            self.stopwords = f.read().split("\n")
    self.safe_stopwords = [
        translitcodec.long_encode(w)[0].lower() for w in self.stopwords
    ]
    return self
import translitcodec


def str_to_ascii(text):
    # Transliterate first, then drop any character that still isn't ASCII.
    return translitcodec.long_encode(text)[0].encode(
        'ascii', 'ignore').decode().strip()
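# Usage sketch (the expected output assumes translitcodec's long-table
# mappings, e.g. '£' -> 'GBP' and '☹' -> ':-('):
print(str_to_ascii('£ ☹ wøóf méåw'))  # -> 'GBP :-( woof meaw'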
#!/usr/local/bin/python3
import sys, os, unicodedata, timeit

sys.path.append(os.getcwd() + '/lib/python3.8/site-packages')
import ctranslitcodec, translitcodec, _ctranslitcodec

x = '£ ☹ wøóf méåw ﷲ etsi vereor judices ne turpe sit pro fortissimo viro incipientem timere minimeque deceat'

a = open('long.txt').read()
print(a)
assert '\u2639' in a
print(translitcodec.long_encode(a))
print(ctranslitcodec.long_encode(unicodedata.normalize('NFKC', a)))
assert ctranslitcodec.long_encode(unicodedata.normalize('NFKC', a))[0] \
    == translitcodec.long_encode(a)[0]

# Validate UTF-8 encoding for every code point, and check that unmapped
# characters fall back to their NFKC normalization.
for i in range(1, 0x10FFFF):
    try:
        c = chr(i)
        c.encode('utf-8')
    except:
        continue
    if len(unicodedata.normalize('NFKC', c)) != 1:
        continue
    if c in a:
        continue
    if unicodedata.normalize('NFKC', c) in a:
        continue
    try:
        assert ctranslitcodec.long_encode(c) == \
            (unicodedata.normalize('NFKC', c), 1)
        assert _ctranslitcodec.long_encode(c) == c
    except:
        print('FAILED AT', i, c)
        print(ctranslitcodec.long_encode(c), '!=',
              (unicodedata.normalize('NFKC', c), 1))