Example #1
def funcao_limpa_tudo(artigo):  # cleans the text and turns it into a list of useful words
    lista_nova = []
    #logger.info('Setting it to 0, do not use it in your scoring function.')
    #print(docs)
    artigo = stem_text(artigo)
    artigo = split_alphanum(artigo)
    artigo = tokenizer.tokenize(
        artigo)  # artigo becomes a list with every word

    list_artigo = list(artigo)  # explicit copy of the token list
    try:
        for palavra in list_artigo:
            if re.match(r'^\d+$', palavra):
                pass  # purely numeric token, skip it
            elif palavra in stop_words:
                pass  # stop word, skip it
            elif len(palavra) < 3:
                pass  # too short, skip it
            else:
                lista_nova.append(palavra)  # keep the useful word
    except Exception as erro:  # avoid crashing the loop and keep going
        numero_palavra = artigo.index(palavra)
        artigo.pop(numero_palavra)  # found garbage, drop it

    del list_artigo  # let Python garbage-collect the copy
    #log.info("Lista de Tokens: %s", docs)
    if len(lista_nova) < 5:
        return None
    else:
        return lista_nova
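This excerpt relies on names defined elsewhere in the original module: tokenizer, stop_words, and the gensim helpers. A minimal sketch of those assumed imports and globals, plus a sample call (the Portuguese stop-word list and the sample sentence are assumptions, not from the source):

import re
from gensim.parsing.preprocessing import stem_text, split_alphanum
from nltk.corpus import stopwords          # needs nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')              # assumed word tokenizer
stop_words = set(stopwords.words('portuguese'))  # assumed stop-word list

print(funcao_limpa_tudo("Os modelos treinados em 2020 usam trezentas dimensoes"))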
Example #2
def transformText(text):
    text = split_alphanum(text)
    # Convert text to lower
    text = text.lower()
    text = replace_word(text)
    text = replace_numbers(text)
    text = reduce_lengthening(text)
    text = remove_alphanumerics(text)
    # Removing non ASCII chars
    text = re.sub(r'[^\x00-\x7f]', r' ', text)
    # Removing all the stopwords
    filtered_words = [word for word in text.split() if word not in STOP_WORDS]
    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    # remove html markup
    text = re.sub("(<.*?>)", "", text)
    # Correct words
    spell = SpellChecker()
    misspelled = text.split()
    wordnet_lemmatizer = WordNetLemmatizer()
    for i in range(len(misspelled)):
        # Get the one `most likely` answer
        word = spell.correction(misspelled[i])
        misspelled[i] = word
        misspelled[i] = wordnet_lemmatizer.lemmatize(misspelled[i], pos="v")
        misspelled[i] = wordnet_lemmatizer.lemmatize(misspelled[i], pos="n")
    text = " ".join(misspelled)

    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    # Strip all the numerics
    #text = gensim.parsing.preprocessing.strip_numeric(text)
    return text
Example #3
def raw_text_preprocess(d):
    d = re.sub(r"http\S+", "", d)
    d = strip_non_alphanum(d).lower().strip()
    d = split_alphanum(d)
    d = strip_short(d, minsize=2)
    d = strip_numeric(d)
    d = ViTokenizer.tokenize(d)
    return d
Example #4
def raw_text_preprocess(raw):
    raw = re.sub(r"http\S+", "", raw)
    raw = strip_non_alphanum(raw).lower().strip()
    raw = split_alphanum(raw)
    raw = strip_short(raw, minsize=2)
    raw = strip_numeric(raw)
    raw = ViTokenizer.tokenize(raw)
    return raw
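Examples #3 and #4 are the same pipeline built from gensim string filters plus pyvi's ViTokenizer. A quick, self-contained look at what each gensim filter contributes (a sketch assuming only that gensim is installed; the sample string is made up):

from gensim.parsing.preprocessing import (split_alphanum, strip_non_alphanum,
                                          strip_numeric, strip_short)

s = "Check http WWW2023 report, v2 is out!"
s = strip_non_alphanum(s).lower().strip()  # punctuation and symbols become spaces
s = split_alphanum(s)                      # "www2023" -> "www 2023", "v2" -> "v 2"
s = strip_short(s, minsize=2)              # drop single-character tokens such as "v"
s = strip_numeric(s)                       # drop the digit runs that are now separate
print(s)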
Example #5
def preprocess_mail(mail):
    mail = re.sub(r"https\S+", "", mail)  # remove URLs
    mail = strip_non_alphanum(mail).lower().strip()  # drop non-letter characters, lowercase everything
    mail = split_alphanum(mail)  # split digits away from words
    mail = strip_short(mail, minsize=2)  # keep words with >= 2 characters, drop single-letter tokens
    mail = strip_numeric(mail)  # remove digits
    mail = ViTokenizer.tokenize(mail)
    return mail
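The final ViTokenizer.tokenize step comes from the pyvi package; it joins Vietnamese multi-word expressions with underscores so that later whitespace splits keep them intact. A tiny sketch, assuming pyvi is installed:

from pyvi import ViTokenizer

print(ViTokenizer.tokenize("hội thảo khoa học"))  # e.g. "hội_thảo khoa_học"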
Example #6
 def convert_num_words(cls, text):
     """
         This method will numbers to text which help preprocessing much easier.
     """
     text = split_alphanum(text)
     temp_str = text.split()
     new_string = [
         INFLECT_ENGINE.number_to_words(word) if word.isdigit() else word
         for word in temp_str
     ]
     temp_str = " ".join(new_string)
     return temp_str
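INFLECT_ENGINE is not shown in the excerpt; it is presumably an engine object from the inflect package, whose number_to_words call is what turns digit tokens into words. A minimal sketch under that assumption:

import inflect

INFLECT_ENGINE = inflect.engine()
print(INFLECT_ENGINE.number_to_words("42"))    # "forty-two"
print(INFLECT_ENGINE.number_to_words("2023"))  # e.g. "two thousand and twenty-three"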
Example #7
def preprocess_text(text):
    text = parse_html_v2(text)
    text = text.lower()
    text = remove_links_content(text)
    text = remove_emails(text)
    text = remove_special_tags(text)  # remove content between {}
    text = remove_punctuation(text)  # remove all punctuation
    text = split_alphanum(text)  # add space between word and numeric
    text = strip_numeric(text)  # remove digits
    text = strip_non_alphanum(text)  # remove non-alphabetic characters
    text = strip_short(text, minsize=2)  # remove word with length < minsize
    text = remove_multiple_space(text).strip()  # remove space and strip
    text = ViTokenizer.tokenize(text)
    return text
Example #8
def gensim_clean_string(textIn, _strip_tags=True, _split_alphanumeric=True, _strip_nonalphanumeric=True,
                        _strip_muliple_whitespace=True, _strip_short=True, _short_charcount_min=3,
                        _strip_punctuation=False, _convert_to_lower = False):
    cleaner = textIn
    if _strip_tags:
        cleaner = strip_tags(textIn)
    if _strip_nonalphanumeric:
        cleaner = strip_non_alphanum(cleaner)
    if _strip_muliple_whitespace:
        cleaner = strip_multiple_whitespaces(cleaner)
    if _split_alphanumeric:
        cleaner = split_alphanum(cleaner)
    if _strip_short:
        cleaner = strip_short(cleaner, minsize=_short_charcount_min)
    if _strip_punctuation:
        cleaner = strip_punctuation(cleaner)
    if _convert_to_lower:
        cleaner = cleaner.lower()

    return cleaner
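A hedged usage sketch for the helper above, assuming its gensim filters come from gensim.parsing.preprocessing (the sample HTML string is made up):

from gensim.parsing.preprocessing import (strip_tags, split_alphanum, strip_non_alphanum,
                                          strip_multiple_whitespaces, strip_short,
                                          strip_punctuation)

raw = "<p>The GPT2 model was   trained on   40GB of text!</p>"
print(gensim_clean_string(raw, _strip_punctuation=True, _convert_to_lower=True))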
Example #9
def transformText(text):
    text = split_alphanum(text)
    # Convert text to lower
    text = text.lower()
    text = replace_word(text)
    text = normaliser_word(text)
    #stops = set(stopwords.words("english"))
    stops={'at', 'only', 'your', 'yourself', 'a', 'i', 'during', 'off', 'myself', 'so', 'o', 'after', 'under', 
           'there', 'against', 'over', 'ourselves', 'they', 'me', 'its', 'then', 'above', 'theirs', 'this', 'into', 
           'from', 'very', 'on', 'yours', 'yourselves', 'herself', 'themselves', 'between', 'if', 'below', 'own', 
           'and', 'you', 'itself', 'him', 'while', 's', 'who', 'we', 'what', 'by', 'ma', 'further', 'such', 'until',
           'through', 'too', 'until', 'through', 't', 'too', 'where', 'up', 'my', 'm', 'out', 'down', 're', 'to', 
           'she', 'd', 'those', 'when', 'it', 'because', 'he', 'in', 'other','each', 'both', 'her', 'but', 'as', 'all', 
           'his', 'again', 'with', 'once', 'am', 'just', 'should', 'why', 'than', 'any', 'should', 'why', 'than',
           'more', 'most', 'that', 've', 'will', 'ours', 'our', 'll', 'the', 'y', 'which', 'whom', 'hers', 'an', 'here',
           'how', 'before', 'about', 'for', 'them', 'these', 'their', 'for', 'them', 'these', 'their', 'or', 'must', 
           'shall', 'would', 'could' , 'need', 'might'}
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    # Removing all the stopwords
    filtered_words = [word for word in text.split() if word not in stops]
    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    # Correct words
    spell = SpellChecker()
    misspelled = text.split()
    wordnet_lemmatizer = WordNetLemmatizer()
    for i in range(len(misspelled)):
        # Get the one `most likely` answer
        word = spell.correction(misspelled[i])
        misspelled[i]=word
        misspelled[i] = wordnet_lemmatizer.lemmatize(misspelled[i], pos="v")
        misspelled[i] = wordnet_lemmatizer.lemmatize(misspelled[i], pos="n")
    text = " ".join(misspelled)
    # Removing all the stopwords
    filtered_words = [word for word in text.split() if word not in stops]
    text = " ".join(filtered_words)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    return text
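Several of these examples (this one, #2, and #15) call SpellChecker and WordNetLemmatizer without showing where they come from; they are presumably pyspellchecker and NLTK. A minimal sketch under that assumption:

from spellchecker import SpellChecker       # pip install pyspellchecker
from nltk.stem import WordNetLemmatizer     # needs nltk.download('wordnet')

spell = SpellChecker()
lemmatizer = WordNetLemmatizer()

word = spell.correction("runnning")         # most likely candidate, e.g. "running"
print(lemmatizer.lemmatize(word, pos="v"))  # "run"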
Example #10
def text_preprocess(bodyItem):  # bodyItem: string (of one mail)  => return: list of words (of one mail)
    # Remove http, https
    bodyItem = re.sub(r'^https?:\/\/.*[\r\n]*', '', bodyItem, flags=re.MULTILINE)
    bodyItem = re.sub(r'^http?:\/\/.*[\r\n]*', '', bodyItem, flags=re.MULTILINE)
    bodyItem = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', " ", bodyItem)
    # Decode some bodyItems which are not decoded
    bodyItem = bodyItem.replace("=", "%")
    bodyItem = urllib.parse.unquote(bodyItem)
    # Remove a word which has numbers and alphabets
    bodyItem = strip_number_alphabets(bodyItem)
    # Remove meaningless words, convert to lower words and split meaningful words 
    bodyItem = strip_non_alphanum(bodyItem).lower().strip()
    bodyItem = split_alphanum(bodyItem)
    # Join two words which have meaning in Vietnamese. Ex: hội thảo -> hội_thảo
    bodyItem = ViTokenizer.tokenize(bodyItem)
    # Remove a word which has one letter
    bodyItem = strip_short(bodyItem, minsize=2)
    # Remove stopwords
    words = [word for word in bodyItem.split() if word not in stopwordsVN_ENG.getStopwordsVN_ENG()]
    return words
Example #11
def funcao_limpa_tudo(artigo):
    tokenizer = RegexpTokenizer(r'\w+')
    lista_nova = []
    #logger.info('Setting it to 0, do not use it in your scoring function.')
    #print(docs)
    artigo = stem_text(artigo)
    artigo = split_alphanum(artigo)
    artigo = tokenizer.tokenize(artigo)

    list_artigo = list(artigo)
    try:
        for palavra in list_artigo:
            if re.match(r'^\d+$', palavra):
                pass  # purely numeric token, skip it
            elif palavra in pt_stop:
                pass  # Portuguese stop word, skip it
            elif len(palavra) < 3:
                pass  # too short, skip it
            else:
                lista_nova.append(palavra)
    except Exception as erro:
        print(erro)
        #numero_palavra = artigo.index(palavra)
        #artigo.pop(numero_palavra)  # found garbage
        pass
    del list_artigo
    del artigo
    #log.info("Lista de Tokens: %s", docs)
    if len(lista_nova) < 5:
        return None
    else:
        return lista_nova
Example #12
 def test_split_alphanum(self):
     self.assertEqual(split_alphanum("toto diet1 titi"), "toto diet 1 titi")
     self.assertEqual(split_alphanum("toto 1diet titi"), "toto 1 diet titi")
Example #13
 def testSplitAlphanum(self):
     self.assertEqual(split_alphanum("toto diet1 titi"), "toto diet 1 titi")
     self.assertEqual(split_alphanum("toto 1diet titi"), "toto 1 diet titi")
Example #14
def gensim_clean_string(textIn):
    cleaner = strip_tags(textIn)
    cleaner = split_alphanum(cleaner)
    cleaner = strip_multiple_whitespaces(cleaner)
    cleaner = strip_short(cleaner, minsize=3)
    return cleaner
Example #15
def transformText(text):
    # separate digits from adjacent letters
    text = split_alphanum(text)
    # Convert text to lower
    text = text.lower()
    text = replace_word(text)
    text = replace_numbers(text)
    text = reduce_lengthening(text)
    text = remove_alphanumerics(text)

    #stops = set(stopwords.words("english"))
    stopword = {
        'only', 'being', 'yourselves', 'out', 'nor', 'few', 'now', 'd',
        'before', 'their', 'and', 'ourselves', 'doing', 'its', "you've",
        'yourself', 'off', 'do', 'm', 'but', 'after', 'during', 'has', 'for',
        'above', 'y', 'on', 've', 'itself', 'been', 'until', 'we', 'shan',
        're', 'll', 'o', 'did', 'there', 'you', 'further', 'some', 'where',
        'through', 'doesn', 'a', 'it', 'does', 'who', 'most', 'she', 'each',
        'am', 'if', 'hadn', 'him', 'the', 'those', 'will', "you'll", 'any',
        'why', 'weren', 'ours', 'that', 'other', 'own', 'once', 'was', 'from',
        'this', 'at', 'such', 'than', 'between', 'because', 'while', 'when',
        'so', 'himself', 'them', 'didn', 's', 'ma', 'shouldn', 'my', 'only',
        'then', 'herself', 'under', 'theirs', 'about', 'here', 'his', 'were',
        'be', 'with', "you'd", 'are', 'both', 'haven', 'of', 'over', 'won',
        'below', 'down', 'ain', 'or', 'to', 'too', 'again', 'an', 'needn',
        'what', 'aren', "should've", 'me', 'hers', "that'll", 'just', 'same',
        'which', "you're", 'having', "it's", 'can', 'these', 'more', 'i', 'up',
        'yours', 'your', 'themselves', 'by', 'don', 'wasn', 'how', 'mightn',
        'as', 'all', 'mustn', 'into', 'in', 'is', 't', 'he', 'had', 'her',
        'our', 'they', 'have', 'against', 'whom', 'isn', "she's", 'myself'
    }
    # Convert text to lower
    text = text.lower()
    # Removing non ASCII chars
    text = re.sub(r'[^\x00-\x7f]', r' ', text)

    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)

    # Removing all the stopwords
    filtered_words = [word for word in text.split() if word not in stopword]
    text = " ".join(filtered_words)

    # Lemmatization: map plural nouns to the singular and verbs to the infinitive
    input_str = word_tokenize(text)
    for i in range(len(input_str)):
        input_str[i] = lemmatizer.lemmatize(input_str[i], pos='v')
        input_str[i] = lemmatizer.lemmatize(input_str[i], pos='n')
    text = " ".join(input_str)

    # Spelling correction
    spell = SpellChecker()
    misspelled = text.split()
    for i in range(len(misspelled)):
        # Get the one `most likely` answer
        word = spell.correction(misspelled[i])
        misspelled[i] = word
    text = " ".join(misspelled)

    # Removing all the tokens with fewer than 3 characters
    filtered_words = gensim.corpora.textcorpus.remove_short(text.split(),
                                                            minsize=3)

    # Preprocessed text after stop-word removal and short-token filtering
    text = " ".join(filtered_words)

    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)

    # Strip all the numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)

    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)

    # Stemming
    return gensim.parsing.preprocessing.stem_text(text)
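The closing stem_text call is gensim's Porter-stemming helper, which also lowercases, so Example #15 ends with heavily normalised tokens. A quick illustration, assuming only gensim:

from gensim.parsing.preprocessing import stem_text

print(stem_text("Computing computers computed quickly"))
# -> "comput comput comput quickli"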