Example 1
def _get_wordnet():
    # Return a WordNetLemmatizer, downloading the WordNet data on first use if it is missing.
    try:
        wn = wordnet.WordNetLemmatizer()
        wn.lemmatize("this is a test")
    except LookupError:
        print("Missing wordnet data -- attempting to download")
        import nltk

        nltk.download("wordnet")
        wn = wordnet.WordNetLemmatizer()
    return wn
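A minimal usage sketch (an assumption, not part of the original snippet), with wordnet taken to be nltk.stem.wordnet as in the rest of these examples:

from nltk.stem import wordnet

lemmatizer = _get_wordnet()
print(lemmatizer.lemmatize("geese"))  # expected: "goose"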
Example 2
def preprocess_tweets(df):
    corpus = []
    stem = PorterStemmer()  # note: instantiated but not used below
    lem = wordnet.WordNetLemmatizer()
    for tweet in df['message']:
        # Drop stop words and user mentions, then lemmatize tokens longer than two characters.
        words = [w for w in word_tokenize(tweet) if ((w.lower() not in stop) and username_filter(w))]
        words = [lem.lemmatize(w) for w in words if len(w) > 2]
        corpus.append(words)
    return corpus
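The names stop and username_filter are defined elsewhere in the original project. A hypothetical stand-in, only to make the snippet self-contained (both names and behavior are assumptions):

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import wordnet

stop = set(stopwords.words('english'))

def username_filter(word):
    # Keep a token only if it is not a Twitter @mention.
    return not word.startswith('@')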
Example 3
def lemmatize1(essay):
    lemma = wn.WordNetLemmatizer()
    pos_list = pos_tag(essay)
    cleaned_essay = []
    for tag in pos_list:
        # Skip @mentions; lemmatize everything else using its mapped WordNet POS.
        if '@' not in tag[0]:
            cleaned_essay.append(
                lemma.lemmatize(tag[0], get_wordnet_pos(tag[1])))
    return cleaned_essay
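get_wordnet_pos is a helper from the original project that is not shown here. A typical implementation, assumed for illustration, maps Penn Treebank tags to the WordNet POS constants that lemmatize() accepts:

from nltk.corpus import wordnet as wordnet_corpus

def get_wordnet_pos(treebank_tag):
    # Hypothetical helper: map a Penn Treebank tag to a WordNet POS constant.
    if treebank_tag.startswith('J'):
        return wordnet_corpus.ADJ
    if treebank_tag.startswith('V'):
        return wordnet_corpus.VERB
    if treebank_tag.startswith('R'):
        return wordnet_corpus.ADV
    return wordnet_corpus.NOUN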
Example 4
def passive_voice_detection(sentence):
    # Heuristic: a form of "be" followed by a verb that is not a gerund suggests passive voice.
    lemma = wn.WordNetLemmatizer()
    error = 0
    for i in range(len(sentence) - 1):
        if sentence[i][1][0] == 'V':
            check = lemma.lemmatize(sentence[i][0], pos="v")
            if (check == 'be' and sentence[i + 1][1][0] == 'V'
                    and sentence[i + 1][1] != 'VBG'):
                error = 1
                break
    return error
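The function expects sentence as a list of (token, Penn-tag) pairs, such as the output of nltk.pos_tag. A minimal usage sketch (the imports are assumptions about the surrounding module, and the punkt and averaged_perceptron_tagger data packages must be available):

from nltk import pos_tag, word_tokenize
from nltk.stem import wordnet as wn

tagged = pos_tag(word_tokenize("The ball was thrown by the pitcher."))
print(passive_voice_detection(tagged))  # likely 1, since "was thrown" matches the heuristic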
Example 5
def lemmatize(essay):
    lemma = wn.WordNetLemmatizer()
    lemma_word = []
    for word in essay:
        # Skip @mentions; chain noun, verb, and adjective lemmatization on each word.
        if '@' not in word:
            word1 = lemma.lemmatize(word, pos="n")
            word2 = lemma.lemmatize(word1, pos="v")
            word3 = lemma.lemmatize(word2, pos="a")
            lemma_word.append(word3)

    return lemma_word
Example 6
def nlp_preprocess_text(text_proc):
    lemmatizer = wordnet.WordNetLemmatizer()
    text_proc = standardize_new_text(text_proc)
    cleaned_tokens = []
    tokens = word_tokenize(text_proc.lower())
    for token in tokens:
        if 0 < len(token) < 20:  # drop empty and overly long tokens
            if not token[0].isdigit() and not token[-1].isdigit():  # drop numbers
                token = spell(token)  # spell-correct before lemmatizing
                lemmed_token = lemmatizer.lemmatize(token)
                cleaned_tokens.append(lemmed_token)

    text_nlp_proc = ' '.join(cleaned_tokens)

    return text_nlp_proc
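standardize_new_text and spell are helpers from the original project (spell is presumably a spell-correction function, for example from a package such as autocorrect). Hypothetical stand-ins, only so the snippet can run in isolation:

import re

def standardize_new_text(text):
    # Hypothetical stand-in: keep only letters and whitespace.
    return re.sub(r'[^A-Za-z\s]', ' ', text)

def spell(token):
    # Hypothetical stand-in: no-op spell corrector.
    return token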
Example 7
def token_lemma_stem(text):
    # 1. tokenize words
    tokenized_words = word_tokenize(preprocess(text))

    tokenized_words = [
        w.lower() for w in tokenized_words if w.lower() not in pass_words
    ]

    # 2. lemmatize words
    lemmatizer = wordnet.WordNetLemmatizer()
    tokenized_words = [lemmatizer.lemmatize(w) for w in tokenized_words]  # tokens were already lowercased above

    # 3. stem words with PorterStemmer
    porter = PorterStemmer()
    tokenized_words = [porter.stem(w) for w in tokenized_words]
    return tokenized_words
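preprocess and pass_words come from the surrounding module. The pipeline lemmatizes before stemming; a hedged illustration of that ordering on a single token:

from nltk.stem import PorterStemmer, wordnet

lem = wordnet.WordNetLemmatizer()
porter = PorterStemmer()
print(porter.stem(lem.lemmatize("studies")))  # lemmatizer gives "study", Porter then stems it to "studi"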
Example 8
def text_normalization(text):
    text = str(text).lower()
    spl_char_text = re.sub(r'[^ a-z]', '', text)  # keep only lowercase letters and spaces
    tokens = nltk.word_tokenize(spl_char_text)
    lema = wordnet.WordNetLemmatizer()
    tags_list = pos_tag(tokens, tagset=None)
    lema_words = []
    for token, pos_token in tags_list:
        # Map the Penn Treebank tag prefix to the WordNet POS code expected by lemmatize().
        if pos_token.startswith('V'):
            pos_val = 'v'
        elif pos_token.startswith('J'):
            pos_val = 'a'
        elif pos_token.startswith('R'):
            pos_val = 'r'
        else:
            pos_val = 'n'
        lema_token = lema.lemmatize(token, pos_val)
        lema_words.append(lema_token)

    return " ".join(lema_words)
Example 9
def fct_nltk(text, stop_words):
    """
    Remove:
        stop words
        punctuation
        uppercase letters
        plurals
    """

    # Create the lemmatizer object
    lemma = wordnet.WordNetLemmatizer()

    # Tokenize and lowercase
    words = word_tokenize(text.lower())

    # Drop stop words and non-alphabetic tokens, then lemmatize (removes plurals)
    new_sentence = [
        lemma.lemmatize(x) for x in words
        if (x not in stop_words) and x.isalpha()
    ]

    # Output
    return new_sentence
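A minimal usage sketch, assuming NLTK's English stop-word list (the stopwords and punkt data packages need to be downloaded first):

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import wordnet

stop_words = set(stopwords.words('english'))
print(fct_nltk("The cats were chasing the dogs!", stop_words))
# expected output along the lines of: ['cat', 'chasing', 'dog']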
Example 10
def reduz_ao_radical_lem(lista_palavras):
    # Lemmatize every word in the list in place (default POS is noun) and return the same list.
    lemmatizer = wordnet.WordNetLemmatizer()
    for i in range(len(lista_palavras)):
        lista_palavras[i] = lemmatizer.lemmatize(lista_palavras[i])
    return lista_palavras
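Note that this variant rewrites the input list in place and also returns it. A minimal usage sketch, assuming the usual nltk.stem import:

from nltk.stem import wordnet

palavras = ["dogs", "churches", "geese"]
print(reduz_ao_radical_lem(palavras))  # expected: ['dog', 'church', 'goose'] (default POS is noun)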