Example #1
def preprocess_string(string, strip_accents=True):
    string = string.lower()
    if strip_accents:
        string = strip_accents_ascii(string)
    pattern = re.compile('[^a-z0-9]+', re.UNICODE)
    string = pattern.sub(' ', string)
    return string
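A minimal usage sketch for the function above, assuming the imports it relies on (`re` and scikit-learn's `strip_accents_ascii`):

import re
from sklearn.feature_extraction.text import strip_accents_ascii

# Accents are folded to ASCII and any run outside [a-z0-9] collapses to a space.
print(preprocess_string("Café #42!"))  # -> 'cafe 42 '  (the trailing '!' becomes a trailing space)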
Example #2
    def _preprocess(self, doc):
        if self.input == "content":
            pass
        elif self.input == "filename":
            with open(doc,
                      "r",
                      encoding=self.encoding,
                      errors=self.decode_error) as fh:
                doc = fh.read()
        elif self.input == "file":
            doc = doc.read()

        if isinstance(doc, bytes):
            doc = doc.decode(self.encoding, self.decode_error)

        if self.strip_accents is not None:
            if self.strip_accents == "unicode":
                doc = strip_accents_unicode(doc)
            elif self.strip_accents == "ascii":
                doc = strip_accents_ascii(doc)
            else:
                raise ValueError('Invalid value for "strip_accents": %s' %
                                 self.strip_accents)

        if self.analyzer == "char" and self._compat_mode():
            doc = self._white_spaces.sub(" ", doc)

        return doc
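For comparison (not part of the example above), similar accent handling is reachable through scikit-learn's public vectorizer API; a small sketch:

from sklearn.feature_extraction.text import CountVectorizer, strip_accents_ascii

# build_preprocessor() returns a callable that lowercases (the default) and then
# applies the requested accent stripping.
preprocess = CountVectorizer(strip_accents='ascii').build_preprocessor()
assert preprocess("Café au lait") == strip_accents_ascii("café au lait")  # 'cafe au lait'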
Example #3
def remove_words_present_in_one_doc(locations, vocab_filepath, config):
    text_files = [get_text_from_files(l) for l in locations]
    texts = []
    for files in text_files:
        texts.extend([strip_accents_ascii(x) for _, x in files])
    with open(vocab_filepath, "r") as vocab_file:
        vocab = set(line.strip('\n') for line in vocab_file)
    seen_once = set()
    seen_atleast_twice = set()
    tok_re = re.compile(re_pattern_tok)
    for text in texts:
        toks = tok_re.findall(text)
        toks = [t.lower() for t in toks]
        for t in set(toks):
            if t in seen_atleast_twice:
                continue
            elif t in seen_once:
                seen_once.remove(t)
                seen_atleast_twice.add(t)
            else:
                seen_once.add(t)
    logging.info("Rempved {} tokens".format(len(seen_once)))
    vocab.difference_update(seen_once)
    with open(vocab_filepath, "w+") as out_vocab:
        out_vocab.write("\n".join(vocab))
Example #4
def normalize(text):
    text = text.decode('utf-8')
    text = re.sub(r'[a-zA-Z]+://[^\s]*', '', text)
    text = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', '', text)
    text = strip_accents_ascii(text)
    text = text.encode('utf-8')
    text = ' '.join(map(lambda x: x.lower(), TreebankWordTokenizer().tokenize(text)))
    return text
Example #5
def normalize_text(text):
    """Basic normalization without altering the semantic"""
    if isinstance(text, str):
        # strip_accents_ascii requires unicode text
        text = text.decode('utf-8')
    text = strip_accents_ascii(text)
    text = remove_non_ascii(text)
    return text
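The snippet above is Python 2 code (`str` has no `.decode()` in Python 3) and uses a `remove_non_ascii` helper that is not shown; a rough Python 3 sketch with a hypothetical stand-in for that helper:

from sklearn.feature_extraction.text import strip_accents_ascii

def remove_non_ascii(text):
    # Hypothetical stand-in for the helper used above: drop whatever survives
    # accent folding but still falls outside ASCII.
    return text.encode('ascii', 'ignore').decode('ascii')

def normalize_text_py3(text):
    """Basic normalization without altering the semantics (Python 3 sketch)."""
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    text = strip_accents_ascii(text)
    return remove_non_ascii(text)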
Example #6
def normalize_text(text):
    """Basic normalization without altering the semantic"""
    if isinstance(text, str):
        # strip_accents_ascii requires unicode text
        text = text.decode("utf-8")
    text = strip_accents_ascii(text)
    text = remove_non_ascii(text)
    return text
Example #7
def normalize(text):
    text = strip_accents_ascii(text.decode('utf-8'))
    text = text.encode('utf-8')
    text = ' '.join(
        map(lambda x: x.lower(),
            TreebankWordTokenizer().tokenize(text)))
    text = str(TextBlob(text).correct())
    return text
Example #8
def test_to_ascii():
    # check some classical latin accentuated symbols
    a = '\xe0\xe1\xe2\xe3\xe4\xe5\xe7\xe8\xe9\xea\xeb'
    expected = 'aaaaaaceeee'
    assert_equal(strip_accents_ascii(a), expected)

    a = '\xec\xed\xee\xef\xf1\xf2\xf3\xf4\xf5\xf6\xf9\xfa\xfb\xfc\xfd'
    expected = 'iiiinooooouuuuy'
    assert_equal(strip_accents_ascii(a), expected)

    # check some arabic
    a = '\u0625'  # alef with a hamza below
    expected = ''  # alef has no direct ascii match
    assert_equal(strip_accents_ascii(a), expected)

    # mix letters accentuated and not
    a = "this is \xe0 test"
    expected = 'this is a test'
    assert_equal(strip_accents_ascii(a), expected)
Example #9
def test_to_ascii():
    # check some classical latin accentuated symbols
    a = 'àáâãäåçèéêë'
    expected = 'aaaaaaceeee'
    assert strip_accents_ascii(a) == expected

    a = "ìíîïñòóôõöùúûüý"
    expected = 'iiiinooooouuuuy'
    assert strip_accents_ascii(a) == expected

    # check some arabic
    a = '\u0625'  # alef with a hamza below
    expected = ''  # alef has no direct ascii match
    assert strip_accents_ascii(a) == expected

    # mix letters accentuated and not
    a = "this is à test"
    expected = 'this is a test'
    assert strip_accents_ascii(a) == expected
Example #10
def test_to_ascii():
    # check some classical latin accentuated symbols
    a = u'\xe0\xe1\xe2\xe3\xe4\xe5\xe7\xe8\xe9\xea\xeb'
    expected = u'aaaaaaceeee'
    assert_equal(strip_accents_ascii(a), expected)

    a = u'\xec\xed\xee\xef\xf1\xf2\xf3\xf4\xf5\xf6\xf9\xfa\xfb\xfc\xfd'
    expected = u'iiiinooooouuuuy'
    assert_equal(strip_accents_ascii(a), expected)

    # check some arabic
    a = u'\u0625'  # alef with a hamza below
    expected = u''  # alef has no direct ascii match
    assert_equal(strip_accents_ascii(a), expected)

    # mix letters accentuated and not
    a = u"this is \xe0 test"
    expected = u'this is a test'
    assert_equal(strip_accents_ascii(a), expected)
Example #11
def test_to_ascii():
    # check some classical latin accentuated symbols
    a = 'àáâãäåçèéêë'
    expected = 'aaaaaaceeee'
    assert_equal(strip_accents_ascii(a), expected)

    a = "ìíîïñòóôõöùúûüý"
    expected = 'iiiinooooouuuuy'
    assert_equal(strip_accents_ascii(a), expected)

    # check some arabic
    a = '\u0625'  # alef with a hamza below
    expected = ''  # alef has no direct ascii match
    assert_equal(strip_accents_ascii(a), expected)

    # mix letters accentuated and not
    a = "this is à test"
    expected = 'this is a test'
    assert_equal(strip_accents_ascii(a), expected)
Example #12
def claims_processor(s, numbers=False):
    # Lowercase
    s = s.lower()

    # Get rid of numbers in patents
    if numbers is False:
        s = re.sub(num, '', s) if s else None

    # URLs and ASCII only
    s = re.sub(links, '', s)
    s = strip_accents_ascii(s)
    s = strip_tags(s)

    return s
Example #13
def preprocess_product_name(text: str, lower: bool, strip_accent: bool,
                            remove_punct: bool, remove_digit: bool) -> str:
    if strip_accent:
        text = strip_accents_ascii(text)

    if lower:
        text = text.lower()

    if remove_punct:
        text = PUNCTUATION_REGEX.sub(" ", text)

    if remove_digit:
        text = DIGIT_REGEX.sub(" ", text)

    return MULTIPLE_SPACES_REGEX.sub(" ", text)
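The module-level regexes are not included in the snippet; one hypothetical set of definitions that makes it runnable (the original project's patterns may differ):

import re
from sklearn.feature_extraction.text import strip_accents_ascii

PUNCTUATION_REGEX = re.compile(r"[^\w\s]")     # assumed pattern
DIGIT_REGEX = re.compile(r"[0-9]")             # assumed pattern
MULTIPLE_SPACES_REGEX = re.compile(r"\s{2,}")  # assumed pattern

preprocess_product_name("Crème brûlée 250g", lower=True, strip_accent=True,
                        remove_punct=True, remove_digit=True)  # -> 'creme brulee g'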
Example #14
    def _preprocess_word(
        self,
        word: str,
        preprocessor_args: PreprocessorArgs = {
            'strip_accents': False,
            'lowercase': False,
            'preprocessor': None,
        }
    ) -> str:
        """pre-processes a word before it is searched in the model's vocabulary.

        Parameters
        ----------
        word : str
            Word to be preprocessed.
        preprocessor_args : PreprocessorArgs, optional
            Dictionary with arguments that specifies how the words will be preprocessed,
            by default { 
                'strip_accents': False, 
                'lowercase': False, 
                'preprocessor': None, }

        Returns
        -------
        str
            The pre-processed word according to the given parameters.
        """

        preprocessor = preprocessor_args.get('preprocessor', None)
        if preprocessor and callable(preprocessor):
            word = preprocessor(word)

        else:
            if preprocessor_args.get('lowercase', False):
                word = word.lower()

            strip_accents = preprocessor_args.get('strip_accents', False)
            if strip_accents == True:
                word = strip_accents_unicode(word)
            elif strip_accents == 'ascii':
                word = strip_accents_ascii(word)
            elif strip_accents == 'unicode':
                word = strip_accents_unicode(word)

        if self.vocab_prefix is not None:
            word = self.vocab_prefix + word

        return word
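The `strip_accents` branch above dispatches to scikit-learn's two helpers; a quick illustration of how they differ:

from sklearn.feature_extraction.text import strip_accents_ascii, strip_accents_unicode

strip_accents_unicode('naïve café')  # 'naive cafe'
strip_accents_ascii('naïve café')    # 'naive cafe'

# They differ on characters that have no ASCII decomposition:
strip_accents_unicode('œuvre')       # 'œuvre' -- the ligature is kept
strip_accents_ascii('œuvre')         # 'uvre'  -- non-ASCII characters are dropped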
Example #15
def readme_processor(s):
    # Capitalization won't help us
    s = s.lower()

    # Remove code and markdown headlines
    s = re.sub(code_ticks, '', s)
    s = re.sub(headlines, '', s)
    s = re.sub(md_links, '', s)
    s = re.sub(links, '', s)

    # ASCII our text and remove html tags
    s = strip_accents_ascii(s)
    s = strip_tags(s)

    # Underscores imply variable names, which are
    # never useful. Get rid of anything in camelcase?
    s = re.sub(underscore, '', s)
    return s
Example #16
def remove_words_not_present(locations, vocab_filepath, config={}):
    text_files = [get_text_from_files(l) for l in locations]
    texts = []
    for files in text_files:
        texts.extend([strip_accents_ascii(x) for _, x in files])
    with open(vocab_filepath, "r") as vocab_file:
        vocab = set(line.strip('\n') for line in vocab_file)
    orig_size = len(vocab)
    result_vocab = set()
    tok_re = re.compile(re_pattern_tok)
    for text in texts:
        toks = tok_re.findall(text)
        toks = [t.lower() for t in toks]
        for t in set(toks):
            if t in vocab:
                result_vocab.add(t)
                vocab.remove(t)
    logging.info("Removed {} toks".format(orig_size - len(result_vocab)))
    with open(vocab_filepath, "w+") as out_vocab:
        out_vocab.write("\n".join(result_vocab))
Example #17
def clean_text(text):
    text = text.lower()
    text = strip_accents_ascii(text.decode('utf-8'))
    return text
Example #18
def normalize(text):
    text = strip_accents_ascii(text)
    text = map(lambda x: x.lower(), TreebankWordTokenizer().tokenize(text))
    return text
Example #19
def lowercase_strip_accents_and_ownership(doc):
    lowercase_no_accents_doc = strip_accents_ascii(doc.lower())
    txt = lowercase_no_accents_doc.replace('"', '').replace("\'s", "").replace(
        "\'ve", " have").replace("\'re",
                                 " are").replace("\'", "").strip("`").strip()
    return txt
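A quick usage example for the function above (quotes and possessive 's are removed, contractions expanded, accents folded):

lowercase_strip_accents_and_ownership('They\'ve said it\'s "très bon"')
# -> 'they have said it tres bon'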
Example #20
def test_vectorizer():
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert counts_train[0, v1.vocabulary_["pizza"]] == 2

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary_)

    # check that the two vectorizers give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        vocabulary = v.vocabulary_
        assert counts_test[0, vocabulary["salad"]] == 1
        assert counts_test[0, vocabulary["tomato"]] == 1
        assert counts_test[0, vocabulary["water"]] == 1

        # stop word from the fixed list
        assert "the" not in vocabulary

        # stop word found automatically by the vectorizer DF thresholding
        # words that are highly frequent across the complete corpus are likely
        # to be uninformative (either real stop words or extraction
        # artifacts)
        assert "copyright" not in vocabulary

        # not present in the sample
        assert counts_test[0, vocabulary["coke"]] == 0
        assert counts_test[0, vocabulary["burger"]] == 0
        assert counts_test[0, vocabulary["beer"]] == 0
        assert counts_test[0, vocabulary["pizza"]] == 0

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = t1.fit(counts_train).transform(counts_train).toarray()
    assert len(t1.idf_) == len(v1.vocabulary_)
    assert tfidf.shape == (n_train, len(v1.vocabulary_))

    # test tf-idf with new data
    tfidf_test = t1.transform(counts_test).toarray()
    assert tfidf_test.shape == (len(test_data), len(v1.vocabulary_))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = t2.fit(counts_train).transform(counts_train).toarray()
    assert not hasattr(t2, "idf_")

    # test idf transform with unlearned idf vector
    t3 = TfidfTransformer(use_idf=True)
    with pytest.raises(ValueError):
        t3.transform(counts_train)

    # test idf transform with incompatible n_features
    X = [[1, 1, 5],
         [1, 1, 0]]
    t3.fit(X)
    X_incompt = [[1, 3],
                 [1, 3]]
    with pytest.raises(ValueError):
        t3.transform(X_incompt)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = TfidfVectorizer(norm='l1')

    tv.max_df = v1.max_df
    tfidf2 = tv.fit_transform(train_data).toarray()
    assert not tv.fixed_vocabulary_
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = tv.transform(test_data).toarray()
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test transform on unfitted vectorizer with empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    with pytest.raises(ValueError):
        v3.transform(train_data)

    # ascii preprocessor?
    v3.set_params(strip_accents='ascii', lowercase=False)
    processor = v3.build_preprocessor()
    text = ("J'ai mangé du kangourou  ce midi, "
            "c'était pas très bon.")
    expected = strip_accents_ascii(text)
    result = processor(text)
    assert expected == result

    # error on bad strip_accents param
    v3.set_params(strip_accents='_gabbledegook_', preprocessor=None)
    with pytest.raises(ValueError):
        v3.build_preprocessor()

    # error with bad analyzer type
    v3.set_params(analyzer='_invalid_analyzer_type_')
    with pytest.raises(ValueError):
        v3.build_analyzer()
Example #21
def lowercase_strip_accents_and_ownership(doc):
    lowercase_no_accents_doc = strip_accents_ascii(doc.lower())
    return lowercase_no_accents_doc.replace("'s", "")
Example #22
def clean_text(text):
    text = text.lower()
    text = strip_accents_ascii(text.decode('utf-8'))
    return text
Example #23
ver
vez
vezes
viagem
vindo
vinte
você
vocês
vos
vós
vossa
vossas
vosso
vossos
zero""".split('\n')
stopwords = set([text.strip_accents_ascii(w) for w in stopwords])

class Classificar(object):
    pass

import hashlib
class Classificacao(object):

    def rodar(self, idioma, matriz=False, balancear=False):
        preparado_caminho = os.path.join(configuracao.DATASET_PREPARADO, idioma + '.csv')
        tuplas_ehspam = []
        tuplas_nao_ehspam = []
        comentarios = []
        ehspam = []
        logger.debug("abrindo arquivo")
Example #24
    def strip_accents(self, entry):
        return strip_accents_ascii(entry)
Example #25
import urllib.request
import urllib.error

import pandas as pd
from warnings import filterwarnings, warn
from sklearn.feature_extraction.text import strip_accents_ascii
from sklearn.feature_extraction.text import CountVectorizer

__doc___ = """The basic NLP tools needed for the PYQAE toolset, including
parsing RTF data, basic word tokenizer and feature extraction,
plus stop-words for german"""

_dswl_list = 'https://gist.githubusercontent.com/kmader/bb889170010d4b9c90a4e7f66107b94b/raw/d3df37bd770d86a60f1250e675ffd6948f7bf7cc/stop_words.txt'
try:
    with urllib.request.urlopen(_dswl_list) as resp:
        deutsch_stop_words = resp.read().decode().split(',')
        ascii_de_stop_words = [
            strip_accents_ascii(x) for x in deutsch_stop_words
        ]
except urllib.error.URLError as e:
    warn("Stop word list could not be loaded, using an empty list!",
         RuntimeWarning)
    deutsch_stop_words = []
    ascii_de_stop_words = []


def _check_de_stop_words():
    """
    >>> len(ascii_de_stop_words)
    1803
    >>> ascii_de_stop_words[998]
    'mehrmaligem'
    """
Example #26
    def preprocess(self, doc):
        return self.alphafilter.sub(' ', strip_accents_ascii(doc.lower()))
Example #27
def preprocess_product_name(text):
    text = strip_accents_ascii(text)
    text = text.lower()
    text = PUNCTUATION_REGEX.sub(' ', text)
    text = DIGIT_REGEX.sub(' ', text)
    return MULTIPLE_SPACES_REGEX.sub(' ', text)