def preprocess_string(string, strip_accents=True):
    string = string.lower()
    if strip_accents:
        string = strip_accents_ascii(string)
    pattern = re.compile('[^a-z0-9]+', re.UNICODE)
    string = pattern.sub(' ', string)
    return string
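# A minimal usage sketch for preprocess_string above, assuming scikit-learn
# is installed for strip_accents_ascii; the sample input and the expected
# output in the trailing comment are illustrative only.
import re
from sklearn.feature_extraction.text import strip_accents_ascii

print(preprocess_string("Crème Brûlée #42"))  # -> 'creme brulee 42'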
def _preprocess(self, doc):
    if self.input == "content":
        pass
    elif self.input == "filename":
        with open(doc, "r", encoding=self.encoding,
                  errors=self.decode_error) as fh:
            doc = fh.read()
    elif self.input == "file":
        doc = doc.read()
    if isinstance(doc, bytes):
        doc = doc.decode(self.encoding, self.decode_error)
    if self.strip_accents is not None:
        if self.strip_accents == "unicode":
            doc = strip_accents_unicode(doc)
        elif self.strip_accents == "ascii":
            doc = strip_accents_ascii(doc)
        else:
            raise ValueError('Invalid value for "strip_accents": %s'
                             % self.strip_accents)
    if self.analyzer == "char" and self._compat_mode():
        doc = self._white_spaces.sub(" ", doc)
    return doc
def remove_words_present_in_one_doc(locations, vocab_filepath, config):
    text_files = [get_text_from_files(l) for l in locations]
    texts = []
    for files in text_files:
        texts.extend([strip_accents_ascii(x) for _, x in files])
    with open(vocab_filepath, "r") as vocab_file:
        vocab = set([line.strip('\n') for line in vocab_file.readlines()])
    seen_once = set()
    seen_atleast_twice = set()
    tok_re = re.compile(re_pattern_tok)
    for text in texts:
        toks = tok_re.findall(text)
        toks = [t.lower() for t in toks]
        for t in set(toks):
            if t in seen_atleast_twice:
                continue
            elif t in seen_once:
                seen_once.remove(t)
                seen_atleast_twice.add(t)
            else:
                seen_once.add(t)
    logging.info("Removed {} tokens".format(len(seen_once)))
    vocab.difference_update(seen_once)
    with open(vocab_filepath, "w+") as out_vocab:
        out_vocab.write("\n".join(vocab))
def normalize(text):
    # Python 2-style handling: decode raw bytes before stripping accents,
    # then re-encode for the tokenizer.
    text = text.decode('utf-8')
    # drop URLs and dotted IPv4 addresses
    text = re.sub(r'[a-zA-Z]+://[^\s]*', '', text)
    text = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', '', text)
    text = strip_accents_ascii(text)
    text = text.encode('utf-8')
    text = ' '.join(map(lambda x: x.lower(),
                        TreebankWordTokenizer().tokenize(text)))
    return text
def normalize_text(text):
    """Basic normalization without altering the semantics"""
    if isinstance(text, str):
        # strip_accents_ascii requires unicode text
        text = text.decode('utf-8')
    text = strip_accents_ascii(text)
    text = remove_non_ascii(text)
    return text
def normalize(text):
    text = strip_accents_ascii(text.decode('utf-8'))
    text = text.encode('utf-8')
    text = ' '.join(
        map(lambda x: x.lower(), TreebankWordTokenizer().tokenize(text)))
    text = str(TextBlob(text).correct())
    return text
def test_to_ascii():
    # check some classical latin accentuated symbols
    a = '\xe0\xe1\xe2\xe3\xe4\xe5\xe7\xe8\xe9\xea\xeb'
    expected = 'aaaaaaceeee'
    assert_equal(strip_accents_ascii(a), expected)

    a = '\xec\xed\xee\xef\xf1\xf2\xf3\xf4\xf5\xf6\xf9\xfa\xfb\xfc\xfd'
    expected = 'iiiinooooouuuuy'
    assert_equal(strip_accents_ascii(a), expected)

    # check some arabic
    a = '\u0625'  # alef with a hamza below
    expected = ''  # alef has no direct ascii match
    assert_equal(strip_accents_ascii(a), expected)

    # mix letters accentuated and not
    a = "this is \xe0 test"
    expected = 'this is a test'
    assert_equal(strip_accents_ascii(a), expected)
def test_to_ascii():
    # check some classical latin accentuated symbols
    a = 'àáâãäåçèéêë'
    expected = 'aaaaaaceeee'
    assert strip_accents_ascii(a) == expected

    a = "ìíîïñòóôõöùúûüý"
    expected = 'iiiinooooouuuuy'
    assert strip_accents_ascii(a) == expected

    # check some arabic
    a = '\u0625'  # alef with a hamza below
    expected = ''  # alef has no direct ascii match
    assert strip_accents_ascii(a) == expected

    # mix letters accentuated and not
    a = "this is à test"
    expected = 'this is a test'
    assert strip_accents_ascii(a) == expected
def test_to_ascii():
    # check some classical latin accentuated symbols
    a = u'\xe0\xe1\xe2\xe3\xe4\xe5\xe7\xe8\xe9\xea\xeb'
    expected = u'aaaaaaceeee'
    assert_equal(strip_accents_ascii(a), expected)

    a = u'\xec\xed\xee\xef\xf1\xf2\xf3\xf4\xf5\xf6\xf9\xfa\xfb\xfc\xfd'
    expected = u'iiiinooooouuuuy'
    assert_equal(strip_accents_ascii(a), expected)

    # check some arabic
    a = u'\u0625'  # alef with a hamza below
    expected = u''  # alef has no direct ascii match
    assert_equal(strip_accents_ascii(a), expected)

    # mix letters accentuated and not
    a = u"this is \xe0 test"
    expected = u'this is a test'
    assert_equal(strip_accents_ascii(a), expected)
def test_to_ascii():
    # check some classical latin accentuated symbols
    a = 'àáâãäåçèéêë'
    expected = 'aaaaaaceeee'
    assert_equal(strip_accents_ascii(a), expected)

    a = "ìíîïñòóôõöùúûüý"
    expected = 'iiiinooooouuuuy'
    assert_equal(strip_accents_ascii(a), expected)

    # check some arabic
    a = '\u0625'  # alef with a hamza below
    expected = ''  # alef has no direct ascii match
    assert_equal(strip_accents_ascii(a), expected)

    # mix letters accentuated and not
    a = "this is à test"
    expected = 'this is a test'
    assert_equal(strip_accents_ascii(a), expected)
def claims_processor(s, numbers=False):
    # Lowercase
    s = s.lower()
    # Get rid of numbers in patents
    if numbers is False:
        s = re.sub(num, '', s) if s else None
    # URLs and ASCII only
    s = re.sub(links, '', s)
    s = strip_accents_ascii(s)
    s = strip_tags(s)
    return s
def preprocess_product_name(text: str, lower: bool, strip_accent: bool,
                            remove_punct: bool, remove_digit: bool) -> str:
    if strip_accent:
        text = strip_accents_ascii(text)
    if lower:
        text = text.lower()
    if remove_punct:
        text = PUNCTUATION_REGEX.sub(" ", text)
    if remove_digit:
        text = DIGIT_REGEX.sub(" ", text)
    return MULTIPLE_SPACES_REGEX.sub(" ", text)
def _preprocess_word(
    self,
    word: str,
    preprocessor_args: PreprocessorArgs = {
        'strip_accents': False,
        'lowercase': False,
        'preprocessor': None,
    }
) -> str:
    """pre-processes a word before it is searched in the model's vocabulary.

    Parameters
    ----------
    word : str
        Word to be preprocessed.
    preprocessor_args : PreprocessorArgs, optional
        Dictionary with arguments that specifies how the words will be
        preprocessed, by default {
            'strip_accents': False,
            'lowercase': False,
            'preprocessor': None,
        }

    Returns
    -------
    str
        The pre-processed word according to the given parameters.
    """
    preprocessor = preprocessor_args.get('preprocessor', None)
    if preprocessor and callable(preprocessor):
        word = preprocessor(word)
    else:
        if preprocessor_args.get('lowercase', False):
            word = word.lower()

        strip_accents = preprocessor_args.get('strip_accents', False)
        if strip_accents == True:
            word = strip_accents_unicode(word)
        elif strip_accents == 'ascii':
            word = strip_accents_ascii(word)
        elif strip_accents == 'unicode':
            word = strip_accents_unicode(word)

    if self.vocab_prefix is not None:
        word = self.vocab_prefix + word

    return word
def readme_processor(s):
    # Capitalization won't help us
    s = s.lower()
    # Remove code and markdown headlines
    s = re.sub(code_ticks, '', s)
    s = re.sub(headlines, '', s)
    s = re.sub(md_links, '', s)
    s = re.sub(links, '', s)
    # ASCII our text and remove html tags
    s = strip_accents_ascii(s)
    s = strip_tags(s)
    # Underscores imply variable names, which are
    # never useful. Get rid of anything in camelcase?
    s = re.sub(underscore, '', s)
    return s
def remove_words_not_present(locations, vocab_filepath, config={}):
    text_files = [get_text_from_files(l) for l in locations]
    texts = []
    for files in text_files:
        texts.extend([strip_accents_ascii(x) for _, x in files])
    with open(vocab_filepath, "r") as vocab_file:
        vocab = set([line.strip('\n') for line in vocab_file.readlines()])
    orig_size = len(vocab)
    result_vocab = set()
    tok_re = re.compile(re_pattern_tok)
    for text in texts:
        toks = tok_re.findall(text)
        toks = [t.lower() for t in toks]
        for t in set(toks):
            if t in vocab:
                result_vocab.add(t)
                vocab.remove(t)
    logging.info("Removed {} toks".format(orig_size - len(result_vocab)))
    with open(vocab_filepath, "w+") as out_vocab:
        out_vocab.write("\n".join(result_vocab))
def clean_text(text):
    text = text.lower()
    text = strip_accents_ascii(text.decode('utf-8'))
    return text
def normalize(text):
    text = strip_accents_ascii(text)
    text = map(lambda x: x.lower(), TreebankWordTokenizer().tokenize(text))
    return text
def lowercase_strip_accents_and_ownership(doc):
    lowercase_no_accents_doc = strip_accents_ascii(doc.lower())
    txt = lowercase_no_accents_doc.replace('"', '').replace("\'s", "").replace(
        "\'ve", " have").replace("\'re", " are").replace("\'", "").strip("`").strip()
    return txt
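# A minimal usage sketch for lowercase_strip_accents_and_ownership above,
# assuming scikit-learn's strip_accents_ascii is imported; the sample
# sentence and the expected output are illustrative only.
from sklearn.feature_extraction.text import strip_accents_ascii

print(lowercase_strip_accents_and_ownership('"The Café\'s owner" said: we\'re open'))
# -> 'the cafe owner said: we are open'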
def test_vectorizer():
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert counts_train[0, v1.vocabulary_["pizza"]] == 2

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary_)

    # compare that the two vectorizers give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        vocabulary = v.vocabulary_
        assert counts_test[0, vocabulary["salad"]] == 1
        assert counts_test[0, vocabulary["tomato"]] == 1
        assert counts_test[0, vocabulary["water"]] == 1

        # stop word from the fixed list
        assert "the" not in vocabulary

        # stop word found automatically by the vectorizer DF thresholding
        # words that are highly frequent across the complete corpus are likely
        # to be uninformative (either real stop words or extraction artifacts)
        assert "copyright" not in vocabulary

        # not present in the sample
        assert counts_test[0, vocabulary["coke"]] == 0
        assert counts_test[0, vocabulary["burger"]] == 0
        assert counts_test[0, vocabulary["beer"]] == 0
        assert counts_test[0, vocabulary["pizza"]] == 0

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = t1.fit(counts_train).transform(counts_train).toarray()
    assert len(t1.idf_) == len(v1.vocabulary_)
    assert tfidf.shape == (n_train, len(v1.vocabulary_))

    # test tf-idf with new data
    tfidf_test = t1.transform(counts_test).toarray()
    assert tfidf_test.shape == (len(test_data), len(v1.vocabulary_))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = t2.fit(counts_train).transform(counts_train).toarray()
    assert not hasattr(t2, "idf_")

    # test idf transform with unlearned idf vector
    t3 = TfidfTransformer(use_idf=True)
    with pytest.raises(ValueError):
        t3.transform(counts_train)

    # test idf transform with incompatible n_features
    X = [[1, 1, 5],
         [1, 1, 0]]
    t3.fit(X)
    X_incompt = [[1, 3],
                 [1, 3]]
    with pytest.raises(ValueError):
        t3.transform(X_incompt)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = TfidfVectorizer(norm='l1')

    tv.max_df = v1.max_df
    tfidf2 = tv.fit_transform(train_data).toarray()
    assert not tv.fixed_vocabulary_
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = tv.transform(test_data).toarray()
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test transform on unfitted vectorizer with empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    with pytest.raises(ValueError):
        v3.transform(train_data)

    # ascii preprocessor?
    v3.set_params(strip_accents='ascii', lowercase=False)
    processor = v3.build_preprocessor()
    text = ("J'ai mangé du kangourou ce midi, "
            "c'était pas très bon.")
    expected = strip_accents_ascii(text)
    result = processor(text)
    assert expected == result

    # error on bad strip_accents param
    v3.set_params(strip_accents='_gabbledegook_', preprocessor=None)
    with pytest.raises(ValueError):
        v3.build_preprocessor()

    # error with bad analyzer type
    v3.set_params = '_invalid_analyzer_type_'
    with pytest.raises(ValueError):
        v3.build_analyzer()
def lowercase_strip_accents_and_ownership(doc):
    lowercase_no_accents_doc = strip_accents_ascii(doc.lower())
    return lowercase_no_accents_doc.replace("'s", "")
ver
vez
vezes
viagem
vindo
vinte
você
vocês
vos
vós
vossa
vossas
vosso
vossos
zero""".split('\n')

stopwords = set([text.strip_accents_ascii(w) for w in stopwords])


class Classificar(object):
    pass


import hashlib


class Classificacao(object):

    def rodar(self, idioma, matriz=False, balancear=False):
        preparado_caminho = os.path.join(configuracao.DATASET_PREPARADO,
                                         idioma + '.csv')
        tuplas_ehspam = []
        tuplas_nao_ehspam = []
        comentarios = []
        ehspam = []
        logger.debug("abrindo arquivo")  # "opening file"
def strip_accents(self, entry):
    return strip_accents_ascii(entry)
import urllib.request
import urllib.error
import pandas as pd
from warnings import filterwarnings, warn
from sklearn.feature_extraction.text import strip_accents_ascii
from sklearn.feature_extraction.text import CountVectorizer

__doc__ = """The basic NLP tools needed for the PYQAE toolset, including
parsing RTF data, basic word tokenizer and feature extraction, plus
stop-words for german"""

_dswl_list = 'https://gist.githubusercontent.com/kmader/bb889170010d4b9c90a4e7f66107b94b/raw/d3df37bd770d86a60f1250e675ffd6948f7bf7cc/stop_words.txt'

try:
    with urllib.request.urlopen(_dswl_list) as resp:
        deutsch_stop_words = resp.read().decode().split(',')
    ascii_de_stop_words = [
        strip_accents_ascii(x) for x in deutsch_stop_words
    ]
except urllib.error.URLError as e:
    warn("Stop word list could not be loaded, using an empty list!",
         RuntimeWarning)
    deutsch_stop_words = []
    ascii_de_stop_words = []


def _check_de_stop_words():
    """
    >>> len(ascii_de_stop_words)
    1803
    >>> ascii_de_stop_words[998]
    'mehrmaligem'
    """
def preprocess(self, doc):
    return self.alphafilter.sub(' ', strip_accents_ascii(doc.lower()))
def preprocess_product_name(text):
    text = strip_accents_ascii(text)
    text = text.lower()
    text = PUNCTUATION_REGEX.sub(' ', text)
    text = DIGIT_REGEX.sub(' ', text)
    return MULTIPLE_SPACES_REGEX.sub(' ', text)
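# A small usage sketch for preprocess_product_name above. The three regular
# expressions are assumptions made for illustration only; the original
# module defines its own PUNCTUATION_REGEX, DIGIT_REGEX and
# MULTIPLE_SPACES_REGEX patterns, which are not shown in this snippet.
import re
from sklearn.feature_extraction.text import strip_accents_ascii

PUNCTUATION_REGEX = re.compile(r"[^\w\s]")      # hypothetical definition
DIGIT_REGEX = re.compile(r"\d")                 # hypothetical definition
MULTIPLE_SPACES_REGEX = re.compile(r"\s{2,}")   # hypothetical definition

print(preprocess_product_name("Pâté de Campagne 250g!"))
# -> 'pate de campagne g '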