Example 1
def prepare_text(x):

    review = re.sub('[^a-zA-Z]', ' ',
                    str(x))  # removing special characters and numbers
    review = review.lower()  # lowering the text
    review = review.split()
    # removing stopwords and lemmatization

    stop_words = set(stopwords.words('english'))
    review = [
        wn.lemmatize(word) for word in review
        if word not in stop_words
    ]
    review = ' '.join(review)

    MAX_NB_WORDS = 60000  # maximum vocabulary size kept by the tokenizer
    MAX_SEQUENCE_LENGTH = 500  # max number of words in each news item
    EMBEDDING_DIM = 100  # embedding dimension (not used in this function)

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts([review])
    word_index = tokenizer.word_index
    #print(f'Found {len(word_index)} unique tokens.')

    X = tokenizer.texts_to_sequences([review])
    X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

    return X
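This example relies on names defined outside the function: `re`, `stopwords`, `wn` (a WordNet lemmatizer), and the Keras text utilities. Below is a minimal sketch of that context plus a sample call; the `tensorflow.keras` import paths and the sample sentence are assumptions, not part of the original code.

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

wn = WordNetLemmatizer()  # the lemmatizer instance the function refers to as `wn`

# sample call: returns a (1, 500) array of padded token ids
X = prepare_text("The reviewers were praising the 2 newest films!")
print(X.shape)

Note that the tokenizer is fitted on the single cleaned review inside the function, so the resulting ids are local to each call rather than shared across a corpus.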
Example 2
    def lemmatize(self):
        """Lemmatize all lists of tokens in self.processed."""

        if not self.processed:
            self.processed = self.title_list

        # tokenize the sentence and find the POS tag for each token
        for i in range(len(self.processed)):
            lang = LANG_MAP[self.selection_langs[i]]
            lemmatized = []
            if lang == "english":
                wordnet = self.lemmatizers[lang]
                nltk_tagged = nltk.pos_tag(self.processed[i])
                # tuple of (token, wordnet_tag)
                wordnet_tagged = map(
                    lambda x: (x[0], self._nltkTag2WordnetTag(x[1])),
                    nltk_tagged)
                for word, tag in wordnet_tagged:
                    if tag is None:
                        # if there is no available tag, append the token as is
                        lemmatized.append(word.lower())
                    else:
                        # else use the tag to lemmatize the token
                        lemmatized.append(wordnet.lemmatize(word, tag).lower())

            if lang == "german":
                hanta = self.lemmatizers[lang]
                tups_taglemmapos = hanta.tag_sent(self.processed[i])
                lemmatized = [tup[1].lower() for tup in tups_taglemmapos]

            self.processed[i] = lemmatized
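The method relies on attributes built elsewhere in the class: `LANG_MAP`, `self.lemmatizers`, and the Penn-to-WordNet converter `_nltkTag2WordnetTag`. The sketch below shows one plausible setup; the HanTa `HanoverTagger` and its German model file are inferred from the `tag_sent` call, and the tag converter follows the usual Penn-prefix mapping, so treat both as assumptions.

import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from HanTa import HanoverTagger  # assumed source of the German tagger used above

# assumed mapping from the language codes stored in self.selection_langs
LANG_MAP = {"en": "english", "de": "german"}

lemmatizers = {
    "english": WordNetLemmatizer(),
    "german": HanoverTagger.HanoverTagger('morphmodel_ger.pgz'),
}

def _nltkTag2WordnetTag(nltk_tag):
    # map a Penn Treebank tag from nltk.pos_tag to a WordNet POS constant
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    if nltk_tag.startswith('V'):
        return wordnet.VERB
    if nltk_tag.startswith('N'):
        return wordnet.NOUN
    if nltk_tag.startswith('R'):
        return wordnet.ADV
    return None

With the default tag level, HanTa's `tag_sent` returns (token, lemma, POS) tuples, which is why the German branch keeps `tup[1]`.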
Example 3
def lemmatize_text(text):
    pos_tagged_text = pos_tag_text(text)

    if isinstance(text, list):
        lemmatized_tokens = [[
            wn.lemmatize(word, pos_tag) if pos_tag else word
            for word, pos_tag in t
        ] for t in pos_tagged_text]
        lemmatized_text = [' '.join(tok) for tok in lemmatized_tokens]
    else:
        lemmatized_tokens = [
            wn.lemmatize(word, pos_tag) if pos_tag else word
            for word, pos_tag in pos_tagged_text
        ]
        lemmatized_text = ' '.join(lemmatized_tokens)

    return lemmatized_text
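`pos_tag_text` is not shown in this example. Judging by how its output is consumed, it returns (word, WordNet tag) pairs, nested one level deeper when the input is a list of texts. A hypothetical reconstruction, assuming NLTK's `pos_tag` and the usual Penn-prefix mapping:

import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

wn = WordNetLemmatizer()  # the `wn` instance used in lemmatize_text above

def pos_tag_text(text):
    # map a Penn Treebank tag to a WordNet POS constant (None if unmapped)
    def to_wordnet(penn_tag):
        return {'J': wordnet.ADJ, 'V': wordnet.VERB,
                'N': wordnet.NOUN, 'R': wordnet.ADV}.get(penn_tag[0])

    def tag_one(t):
        tokens = t if isinstance(t, list) else t.split()
        return [(word, to_wordnet(tag)) for word, tag in nltk.pos_tag(tokens)]

    if isinstance(text, list):
        return [tag_one(t) for t in text]
    return tag_one(text)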
Example 4
def lem_words(document):
    wordnet = WordNetLemmatizer()
    lemmatized_document = []

    for word in document:
        check_word = word[0].lower()
        tag = get_wordnet(word[1])
        if tag != 'Remove':
            lemmatized_document.append(str(wordnet.lemmatize(check_word, tag)))
    return lemmatized_document
Example 5
def lemmatizeWithTags(lines):
	processedLines = []
	for line in lines:
		words = tag(line)
		toWrite = ""
		for word in words:
			toWrite = toWrite + wn.lemmatize(word[0], penn_to_wn(word[1])) + " "
			# print('This is the original word: ' + word[0])
			# print('This is the category: ' + word[1])
			# print('And this is the lemmatized word: ' + wn.lemmatize(word[0], penn_to_wn(word[1])))
		# Remove trailing whitespace
		processedLines.append(toWrite.rstrip())
	return processedLines
Example 6
def basic_text_cleaning(line_from_column):
    # This function takes in a string, not a list or an array for the arg line_from_column

    tokenized_doc = word_tokenize(line_from_column)

    new_review = []
    for token in tokenized_doc:
        new_token = regex.sub(u'', token)
        if new_token != u'':
            new_review.append(new_token)

    stop_words = set(stopwords.words('english'))
    new_term_vector = []
    for word in new_review:
        if word not in stop_words:
            new_term_vector.append(word)

    final_doc = []
    for word in new_term_vector:
        final_doc.append(wordnet.lemmatize(word))

    return ' '.join(final_doc)
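This example depends on three module-level names: `regex` (a compiled pattern used to strip unwanted characters from each token), NLTK's `stopwords`, and `wordnet` as a `WordNetLemmatizer` instance. A minimal sketch of that context with a sample call; the exact pattern assigned to `regex` is an assumption.

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

regex = re.compile(r'[^a-zA-Z]')  # assumed: drop anything that is not a letter
wordnet = WordNetLemmatizer()     # in this snippet `wordnet` is a lemmatizer instance, not the corpus

print(basic_text_cleaning("The movies were surprisingly good, weren't they?"))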
Example 7
def clean(df):
    for i in range(0, df.shape[0]):
        try:
            ques = re.sub(r'^https?:\/\/.*[\r\n]*',
                          ' hyperlink ',
                          df.loc[i, 'Question'],
                          flags=re.MULTILINE)
            ques = re.sub(r'[^@\?a-zA-Z]', ' ', ques, flags=re.MULTILINE)
            ques = re.sub(r'[\?]', ' question ', ques, flags=re.MULTILINE)
            ques = re.sub(r'[@]', ' answer ', ques, flags=re.MULTILINE)
            ques = re.sub(r'[^\w]', ' ', ques, flags=re.MULTILINE)
            ques = ques.lower()
            ques = ques.split()
            wn = WordNetLemmatizer()
            temp = pos_tag(ques)
            ques = [
                wn.lemmatize(word, tag_map[tag[0]]) for (word, tag) in temp
                if word not in new_stopwords
            ]
            ques = ' '.join(ques)
            df.loc[i, 'Question'] = ques.lower().strip()
        except Exception:
            print(i)
    return df
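`tag_map`, `new_stopwords`, `pos_tag`, and `WordNetLemmatizer` come from the surrounding module. The `tag_map[tag[0]]` lookup suggests a `defaultdict` keyed on the first letter of the Penn Treebank tag, defaulting to nouns; the sketch below is one common way to build that context and is an assumption, not the author's code.

from collections import defaultdict

from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

# assumed: the first letter of the Penn tag selects the WordNet POS, noun by default
tag_map = defaultdict(lambda: wordnet.NOUN)
tag_map['J'] = wordnet.ADJ
tag_map['V'] = wordnet.VERB
tag_map['R'] = wordnet.ADV

# assumed: the standard English stopword list, possibly extended with domain-specific terms
new_stopwords = set(stopwords.words('english'))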