def prepare_text(x):
    review = re.sub('[^a-zA-Z]', ' ', str(x))  # remove special characters and numbers
    review = review.lower()                    # lowercase the text
    review = review.split()
    # remove stopwords and lemmatize
    review = [
        wn.lemmatize(word) for word in review
        if word not in set(stopwords.words('english'))
    ]
    review = ' '.join(review)

    MAX_NB_WORDS = 60000        # maximum vocabulary size kept by the tokenizer
    MAX_SEQUENCE_LENGTH = 500   # pad/truncate every sequence to this length
    EMBEDDING_DIM = 100         # embedding size (unused here, kept for reference)

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts([review])
    word_index = tokenizer.word_index
    # print(f'Found {len(word_index)} unique tokens.')

    X = tokenizer.texts_to_sequences([review])
    X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
    return X
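# --- Hedged usage sketch (not from the original source) ---
# prepare_text() above assumes module-level imports and a `wn` lemmatizer.
# The setup below is an assumption about what that surrounding module looks like,
# using NLTK and the TensorFlow Keras preprocessing utilities.
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

nltk.download('stopwords')
nltk.download('wordnet')
wn = WordNetLemmatizer()

# Example call: encode one news item as a padded integer sequence.
X = prepare_text("Breaking: 3 new satellites launched into orbit!")
print(X.shape)  # (1, 500)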
def lemmatize(self):
    """Lemmatize all lists of tokens in self.processed."""
    if not self.processed:
        self.processed = self.title_list

    # tokenize the sentence and find the POS tag for each token
    for i in range(len(self.processed)):
        lang = LANG_MAP[self.selection_langs[i]]
        lemmatized = []

        if lang == "english":
            wordnet = self.lemmatizers[lang]
            nltk_tagged = nltk.pos_tag(self.processed[i])
            # tuples of (token, wordnet_tag)
            wordnet_tagged = map(
                lambda x: (x[0], self._nltkTag2WordnetTag(x[1])), nltk_tagged)
            for word, tag in wordnet_tagged:
                if tag is None:
                    # if there is no available tag, append the token as is
                    lemmatized.append(word.lower())
                else:
                    # else use the tag to lemmatize the token
                    lemmatized.append(wordnet.lemmatize(word, tag).lower())
        elif lang == "german":
            hanta = self.lemmatizers[lang]
            tups_taglemmapos = hanta.tag_sent(self.processed[i])
            lemmatized = [tup[1].lower() for tup in tups_taglemmapos]

        self.processed[i] = lemmatized
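# --- Hedged sketch (assumption, not from the original source) ---
# The method above expects a LANG_MAP constant and a self.lemmatizers dict.
# A plausible setup, assuming NLTK's WordNetLemmatizer for English and the
# HanTa HanoverTagger with its pretrained German model for German:
import nltk
from nltk.stem import WordNetLemmatizer
from HanTa import HanoverTagger

LANG_MAP = {"en": "english", "de": "german"}   # hypothetical language codes

lemmatizers = {
    "english": WordNetLemmatizer(),
    "german": HanoverTagger.HanoverTagger('morphmodel_ger.pgz'),
}
# hanta.tag_sent(tokens) yields (word, lemma, POS) tuples, so tup[1] is the lemma.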
def lemmatize_text(text):
    pos_tagged_text = pos_tag_text(text)
    if type(text) == list:
        # list of tokenized sentences: lemmatize each sentence separately
        lemmatized_tokens = [[
            wn.lemmatize(word, pos_tag) if pos_tag else word
            for word, pos_tag in t
        ] for t in pos_tagged_text]
        lemmatized_text = [' '.join(tok) for tok in lemmatized_tokens]
    else:
        # single string: lemmatize its POS-tagged tokens
        lemmatized_tokens = [
            wn.lemmatize(word, pos_tag) if pos_tag else word
            for word, pos_tag in pos_tagged_text
        ]
        lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text
def lem_words(document):
    wordnet = WordNetLemmatizer()
    lemmatized_document = []
    for word in document:
        # each item is a (token, POS tag) tuple from the tagger
        check_word = word[0].lower()
        tag = get_wordnet(word[1])
        if tag != 'Remove':
            lemmatized_document.append(str(wordnet.lemmatize(check_word, tag)))
    return lemmatized_document
def lemmatizeWithTags(lines):
    processedLines = []
    for line in lines:
        # POS-tag the line, then lemmatize each (word, tag) pair
        words = tag(line)
        lemmas = [wn.lemmatize(word[0], penn_to_wn(word[1])) for word in words]
        # join with single spaces (no trailing whitespace)
        processedLines.append(' '.join(lemmas))
    return processedLines
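# --- Hedged sketch (assumption, not from the original source) ---
# The three functions above rely on a Penn-Treebank-to-WordNet tag mapper
# (pos_tag_text, get_wordnet, penn_to_wn); a typical version looks like this.
# Requires nltk.download('punkt'), 'averaged_perceptron_tagger', and 'wordnet'.
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn_corpus
from nltk.stem import WordNetLemmatizer

def penn_to_wn(penn_tag):
    """Map a Penn Treebank tag to a WordNet POS constant (noun by default)."""
    if penn_tag.startswith('J'):
        return wn_corpus.ADJ
    if penn_tag.startswith('V'):
        return wn_corpus.VERB
    if penn_tag.startswith('R'):
        return wn_corpus.ADV
    return wn_corpus.NOUN

wn = WordNetLemmatizer()
tokens = word_tokenize("The striped bats were hanging on their feet")
print([wn.lemmatize(w, penn_to_wn(t)) for w, t in pos_tag(tokens)])
# POS-aware lemmas, e.g. 'bats' -> 'bat', 'were' -> 'be', 'feet' -> 'foot'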
def basic_text_cleaning(line_from_column):
    # This function takes in a string, not a list or an array,
    # for the arg line_from_column
    tokenized_doc = word_tokenize(line_from_column)

    # strip unwanted characters with the (module-level) compiled `regex` pattern
    new_review = []
    for token in tokenized_doc:
        new_token = regex.sub(u'', token)
        if not new_token == u'':
            new_review.append(new_token)

    # drop English stopwords
    new_term_vector = []
    for word in new_review:
        if word not in stopwords.words('english'):
            new_term_vector.append(word)

    # lemmatize with the (module-level) `wordnet` WordNetLemmatizer instance
    final_doc = []
    for word in new_term_vector:
        final_doc.append(wordnet.lemmatize(word))

    return ' '.join(final_doc)
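# --- Hedged sketch (assumption, not from the original source) ---
# basic_text_cleaning() above expects a compiled `regex` that removes punctuation
# and a `wordnet` lemmatizer at module level; one plausible setup:
import re
import string

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

regex = re.compile('[%s]' % re.escape(string.punctuation))  # strip punctuation
wordnet = WordNetLemmatizer()

print(basic_text_cleaning("The cats, naturally, were chasing mice!"))
# e.g. 'The cat naturally chasing mouse'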
def clean(df):
    for i in range(0, df.shape[0]):
        try:
            # replace URLs, keep '?' and '@' as markers, then strip everything else
            ques = re.sub(r'^https?:\/\/.*[\r\n]*', ' hyperlink ',
                          df.loc[i, 'Question'], flags=re.MULTILINE)
            ques = re.sub(r'[^@\?a-zA-Z]', ' ', ques, flags=re.MULTILINE)
            ques = re.sub(r'[\?]', ' question ', ques, flags=re.MULTILINE)
            ques = re.sub(r'[@]', ' answer ', ques, flags=re.MULTILINE)
            ques = re.sub(r'[^\w]', ' ', ques, flags=re.MULTILINE)

            ques = ques.lower()
            ques = ques.split()

            # POS-aware lemmatization, dropping custom stopwords
            wn = WordNetLemmatizer()
            temp = pos_tag(ques)
            ques = [
                wn.lemmatize(word, tag_map[tag[0]])
                for (word, tag) in temp
                if word not in new_stopwords
            ]
            ques = ' '.join(ques)
            df.loc[i, 'Question'] = ques.lower().strip()
        except Exception:
            # report the row index that failed to clean
            print(i)
    return df
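# --- Hedged usage sketch (assumption, not from the original source) ---
# clean() above relies on module-level `tag_map` and `new_stopwords`; a plausible
# setup and a small example DataFrame:
import re
from collections import defaultdict

import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet as wn_corpus
from nltk.stem import WordNetLemmatizer

# map the first letter of a Penn tag to a WordNet POS, defaulting to noun
tag_map = defaultdict(lambda: wn_corpus.NOUN)
tag_map.update({'J': wn_corpus.ADJ, 'V': wn_corpus.VERB, 'R': wn_corpus.ADV})
new_stopwords = set(stopwords.words('english'))

df = pd.DataFrame({'Question': ["How were the 3 answers @user1 graded?"]})
print(clean(df).loc[0, 'Question'])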