def prepare_text(x):
    review = re.sub('[^a-zA-Z]', ' ', str(x))  # remove special characters and numbers
    review = review.lower()                    # lowercase the text
    review = review.split()
    # remove stopwords and lemmatize
    review = [
        wn.lemmatize(word) for word in review
        if word not in set(stopwords.words('english'))
    ]
    review = ' '.join(review)

    MAX_NB_WORDS = 60000        # maximum vocabulary size kept by the tokenizer
    MAX_SEQUENCE_LENGTH = 500   # pad/truncate every sequence to this length
    EMBEDDING_DIM = 100         # embedding size (unused here, kept for reference)

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts([review])
    word_index = tokenizer.word_index
    # print(f'Found {len(word_index)} unique tokens.')

    X = tokenizer.texts_to_sequences([review])
    X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
    return X
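# --- Hedged usage sketch (not from the original source) ---
# prepare_text() above assumes module-level imports and a `wn` lemmatizer.
# The setup below is an assumption about what that surrounding module looks like,
# using NLTK and the TensorFlow Keras preprocessing utilities.
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

nltk.download('stopwords')
nltk.download('wordnet')
wn = WordNetLemmatizer()

# Example call: encode one news item as a padded integer sequence.
X = prepare_text("Breaking: 3 new satellites launched into orbit!")
print(X.shape)  # (1, 500)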
def lemmatize(self):
    """Lemmatize all lists of tokens in self.processed."""
    if not self.processed:
        self.processed = self.title_list

    # tokenize the sentence and find the POS tag for each token
    for i in range(len(self.processed)):
        lang = LANG_MAP[self.selection_langs[i]]
        lemmatized = []

        if lang == "english":
            wordnet = self.lemmatizers[lang]
            nltk_tagged = nltk.pos_tag(self.processed[i])
            # tuples of (token, wordnet_tag)
            wordnet_tagged = map(
                lambda x: (x[0], self._nltkTag2WordnetTag(x[1])), nltk_tagged)
            for word, tag in wordnet_tagged:
                if tag is None:
                    # if there is no available tag, append the token as is
                    lemmatized.append(word.lower())
                else:
                    # else use the tag to lemmatize the token
                    lemmatized.append(wordnet.lemmatize(word, tag).lower())
        elif lang == "german":
            hanta = self.lemmatizers[lang]
            tups_taglemmapos = hanta.tag_sent(self.processed[i])
            lemmatized = [tup[1].lower() for tup in tups_taglemmapos]

        self.processed[i] = lemmatized
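# --- Hedged sketch (assumption, not from the original source) ---
# The method above expects a LANG_MAP constant and a self.lemmatizers dict.
# A plausible setup, assuming NLTK's WordNetLemmatizer for English and the
# HanTa HanoverTagger with its pretrained German model for German:
import nltk
from nltk.stem import WordNetLemmatizer
from HanTa import HanoverTagger

LANG_MAP = {"en": "english", "de": "german"}   # hypothetical language codes

lemmatizers = {
    "english": WordNetLemmatizer(),
    "german": HanoverTagger.HanoverTagger('morphmodel_ger.pgz'),
}
# hanta.tag_sent(tokens) yields (word, lemma, POS) tuples, so tup[1] is the lemma.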
def lemmatize_text(text):
    pos_tagged_text = pos_tag_text(text)
    if type(text) == list:
        # list of tokenized sentences: lemmatize each sentence separately
        lemmatized_tokens = [[
            wn.lemmatize(word, pos_tag) if pos_tag else word
            for word, pos_tag in t
        ] for t in pos_tagged_text]
        lemmatized_text = [' '.join(tok) for tok in lemmatized_tokens]
    else:
        # single string: lemmatize its POS-tagged tokens
        lemmatized_tokens = [
            wn.lemmatize(word, pos_tag) if pos_tag else word
            for word, pos_tag in pos_tagged_text
        ]
        lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text
def lem_words(document):
    wordnet = WordNetLemmatizer()
    lemmatized_document = []
    for word in document:
        # each item is a (token, POS tag) tuple from the tagger
        check_word = word[0].lower()
        tag = get_wordnet(word[1])
        if tag != 'Remove':
            lemmatized_document.append(str(wordnet.lemmatize(check_word, tag)))
    return lemmatized_document
def lemmatizeWithTags(lines):
    processedLines = []
    for line in lines:
        # POS-tag the line, then lemmatize each (word, tag) pair
        words = tag(line)
        lemmas = [wn.lemmatize(word[0], penn_to_wn(word[1])) for word in words]
        # join with single spaces (no trailing whitespace)
        processedLines.append(' '.join(lemmas))
    return processedLines
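# --- Hedged sketch (assumption, not from the original source) ---
# The three functions above rely on a Penn-Treebank-to-WordNet tag mapper
# (pos_tag_text, get_wordnet, penn_to_wn); a typical version looks like this.
# Requires nltk.download('punkt'), 'averaged_perceptron_tagger', and 'wordnet'.
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn_corpus
from nltk.stem import WordNetLemmatizer

def penn_to_wn(penn_tag):
    """Map a Penn Treebank tag to a WordNet POS constant (noun by default)."""
    if penn_tag.startswith('J'):
        return wn_corpus.ADJ
    if penn_tag.startswith('V'):
        return wn_corpus.VERB
    if penn_tag.startswith('R'):
        return wn_corpus.ADV
    return wn_corpus.NOUN

wn = WordNetLemmatizer()
tokens = word_tokenize("The striped bats were hanging on their feet")
print([wn.lemmatize(w, penn_to_wn(t)) for w, t in pos_tag(tokens)])
# POS-aware lemmas, e.g. 'bats' -> 'bat', 'were' -> 'be', 'feet' -> 'foot'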
def basic_text_cleaning(line_from_column):
    # This function takes in a string, not a list or an array,
    # for the arg line_from_column
    tokenized_doc = word_tokenize(line_from_column)

    # strip unwanted characters with the (module-level) compiled `regex` pattern
    new_review = []
    for token in tokenized_doc:
        new_token = regex.sub(u'', token)
        if not new_token == u'':
            new_review.append(new_token)

    # drop English stopwords
    new_term_vector = []
    for word in new_review:
        if word not in stopwords.words('english'):
            new_term_vector.append(word)

    # lemmatize with the (module-level) `wordnet` WordNetLemmatizer instance
    final_doc = []
    for word in new_term_vector:
        final_doc.append(wordnet.lemmatize(word))

    return ' '.join(final_doc)
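# --- Hedged sketch (assumption, not from the original source) ---
# basic_text_cleaning() above expects a compiled `regex` that removes punctuation
# and a `wordnet` lemmatizer at module level; one plausible setup:
import re
import string

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

regex = re.compile('[%s]' % re.escape(string.punctuation))  # strip punctuation
wordnet = WordNetLemmatizer()

print(basic_text_cleaning("The cats, naturally, were chasing mice!"))
# e.g. 'The cat naturally chasing mouse'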
def clean(df):
    for i in range(0, df.shape[0]):
        try:
            # replace URLs, keep '?' and '@' as markers, then strip everything else
            ques = re.sub(r'^https?:\/\/.*[\r\n]*', ' hyperlink ',
                          df.loc[i, 'Question'], flags=re.MULTILINE)
            ques = re.sub(r'[^@\?a-zA-Z]', ' ', ques, flags=re.MULTILINE)
            ques = re.sub(r'[\?]', ' question ', ques, flags=re.MULTILINE)
            ques = re.sub(r'[@]', ' answer ', ques, flags=re.MULTILINE)
            ques = re.sub(r'[^\w]', ' ', ques, flags=re.MULTILINE)

            ques = ques.lower()
            ques = ques.split()

            # POS-aware lemmatization, dropping custom stopwords
            wn = WordNetLemmatizer()
            temp = pos_tag(ques)
            ques = [
                wn.lemmatize(word, tag_map[tag[0]])
                for (word, tag) in temp
                if word not in new_stopwords
            ]
            ques = ' '.join(ques)
            df.loc[i, 'Question'] = ques.lower().strip()
        except Exception:
            # report the row index that failed to clean
            print(i)
    return df
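# --- Hedged usage sketch (assumption, not from the original source) ---
# clean() above relies on module-level `tag_map` and `new_stopwords`; a plausible
# setup and a small example DataFrame:
import re
from collections import defaultdict

import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet as wn_corpus
from nltk.stem import WordNetLemmatizer

# map the first letter of a Penn tag to a WordNet POS, defaulting to noun
tag_map = defaultdict(lambda: wn_corpus.NOUN)
tag_map.update({'J': wn_corpus.ADJ, 'V': wn_corpus.VERB, 'R': wn_corpus.ADV})
new_stopwords = set(stopwords.words('english'))

df = pd.DataFrame({'Question': ["How were the 3 answers @user1 graded?"]})
print(clean(df).loc[0, 'Question'])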