from nlppreprocess import NLP


def preprocessQuery(df):
    # Remove stopwords and lowercase the query column, then stem the flattened text.
    # Assumes a project-level `stemming` helper that returns the tokens.
    nlp = NLP()
    df['query'] = df['query'].apply(nlp.process)
    df['query'] = df['query'].str.lower()
    # df['query'] = df['query'].apply(stemming)
    text = df.to_string(index=False, header=False)
    tokens = stemming(text)
    return tokens
import pandas as pd
from nlppreprocess import NLP


def preprocess(df):
    # Strip the artist and track names out of the body text so they do not leak into the features.
    # Write through df.at so the change persists (mutating the `row` copy from iterrows() does not).
    for index, row in df.iterrows():
        if pd.notna(row['artist-name']):
            df.at[index, 'body'] = row['body'].replace(row['artist-name'], '')
            df.at[index, 'body'] = df.at[index, 'body'].replace(row['track-name'], '')
    # Remove stopwords, lowercase and stem both text columns.
    # Assumes a project-level `stemming` helper.
    nlp = NLP()
    df['body'] = df['body'].apply(nlp.process)
    df['body'] = df['body'].str.lower()
    df['body'] = df['body'].apply(stemming)
    df['lyrics'] = df['lyrics'].apply(nlp.process)
    df['lyrics'] = df['lyrics'].str.lower()
    df['lyrics'] = df['lyrics'].apply(stemming)
    return df
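# A minimal usage sketch for preprocess() above. The project's `stemming` helper is not shown
# in this snippet, so a hypothetical SnowballStemmer-based stand-in is defined here purely for
# illustration; the column names and sample rows are made up to match what the function expects.
from nltk.stem.snowball import SnowballStemmer

_stemmer = SnowballStemmer('english')


def stemming(text):
    # hypothetical stand-in for the project's own stemming helper
    return ' '.join(_stemmer.stem(w) for w in text.split())


sample = pd.DataFrame({
    'artist-name': ['Adele'],
    'track-name': ['Hello'],
    'body': ['Adele performs Hello with a full band'],
    'lyrics': ['Hello from the other side'],
})
print(preprocess(sample)[['body', 'lyrics']])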
from nlppreprocess import NLP


def one_string_stop_words(sentence):
    '''
    Argument: a string of words.
    Return: the string with stop words such as "this" and "did" removed,
    while words that carry negation, such as "not" and "no", are kept.
    '''
    stop_words = NLP().stopword_list  # stopword list that deliberately leaves out negations
    words = sentence.split(' ')
    updated_sentence = ''
    for word in words:
        if word not in stop_words:
            updated_sentence += word + ' '
    return updated_sentence
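# A quick usage sketch (assumes nlppreprocess is installed). The sample sentence is only
# illustrative: whatever appears in NLP().stopword_list is dropped, while everything else,
# including negations like "not", is kept, which is the property the docstring describes.
print(one_string_stop_words('this film was not what i expected'))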
import re

from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nlppreprocess import NLP


def cleaner(line):
    # Remove RT markers, urls and trailing white spaces
    line = re.sub(r'^RT ', '', re.sub(r'https://t\.co/\w+', '', line).strip())

    # Remove punctuation
    punctuation = re.compile(r"[.;:!\'’‘“”?,\"()\[\]]")
    tweet = punctuation.sub("", line.lower())

    # Remove stopwords. This only drops stop words that are not necessary; the idea is to
    # keep words like [is, not, was].
    # https://towardsdatascience.com/why-you-should-avoid-removing-stopwords-aa7a353d2a52
    nlp_for_stopwords = NLP(replace_words=True, remove_stopwords=True,
                            remove_numbers=True, remove_punctuations=False)
    tweet = nlp_for_stopwords.process(tweet)

    # Tokenisation. We use split() instead of word_tokenize because the tweet is already
    # clean at this point and the Twitter data is not complicated.
    tweet = tweet.split()

    # POS tagging
    pos = pos_tag(tweet)

    # Lemmatization. Map the Penn Treebank tag prefix to a WordNet POS
    # (JJ -> 'a' for adjectives) and leave other words unchanged.
    lemmatizer = WordNetLemmatizer()
    wordnet_pos = {'n': 'n', 'r': 'r', 'v': 'v', 'j': 'a'}
    tweet = ' '.join([
        lemmatizer.lemmatize(word, wordnet_pos[po[0].lower()])
        if po[0].lower() in wordnet_pos else word
        for word, po in pos
    ])
    # tweet = ' '.join([lemmatizer.lemmatize(word, 'v') for word in tweet])
    return tweet
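# A small usage sketch for cleaner(). It assumes the NLTK resources used above have already
# been downloaded (uncomment the download calls otherwise); the sample tweet is made up.
# import nltk
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
print(cleaner("RT This was not the best launch we have seen https://t.co/abc123"))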
from nlppreprocess import NLP


def one_string_stop_words(sentence, language):
    '''
    Argument: a string of words and its language ('English' or 'Arabic').
    Return: the string with stop words such as "this" and "did" removed,
    while words that carry negation, such as "not" and "no", are kept.
    '''
    if language == 'English' or language == 'english':
        stop_words = NLP().stopword_list  # retrieve the English stopword list
    elif language == 'Arabic' or language == 'arabic':
        # Build the Arabic stopword list from the curated word files.
        file_dir1 = 'sentiment_behind_reviews/ml_work/stop_words/nltk_stop_words_handle.txt'
        file_di2 = 'sentiment_behind_reviews/ml_work/stop_words/stop_list1.txt'
        file_di3 = 'sentiment_behind_reviews/ml_work/stop_words/updated_stop_words.txt'
        stop_words_designed = []
        stop_words_designed.extend(convert_file_of_stop_words_to_list(file_di2))
        stop_words_designed = list(set(stop_words_designed))  # drop duplicates
        arabic_stop_words_designed = convert_file_of_stop_words_to_list(file_di3)
        stop_words = arabic_stop_words_designed
    updated_sentence = ''
    for word in sentence.split(' '):
        if word not in stop_words:
            updated_sentence += word + ' '
    return updated_sentence
from nlppreprocess import NLP


def one_string_stop_words(sentence, language):
    '''
    Argument: a string of words and its language ('English' or 'Arabic').
    Return: the string with stop words such as "this" and "did" removed,
    while words that carry negation, such as "not" and "no", are kept.
    '''
    if language == 'English' or language == 'english':
        stop_words = NLP().stopword_list  # retrieve the English stopword list
    elif language == 'Arabic' or language == 'arabic':
        # `arabic_stop_words_designed` is a list built elsewhere in the project.
        stop_words = arabic_stop_words_designed
    updated_sentence = ''
    for word in sentence.split(' '):
        if word not in stop_words:
            updated_sentence += word + ' '
    return updated_sentence
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nlppreprocess import NLP

# *** reading raw data generated by the twint scraper ***
data = pd.read_csv(
    'Processed_data_input_to_sentiment.csv',
    encoding='latin1',
    usecols=["date", "time", "username", "name", "tweet", "likes_count"])

pos = []
neg = []
neu = []
compound = []

nlp = NLP()
data['tweet'] = data['tweet'].apply(nlp.process)
data.to_csv("Temp_StopWords_Removed.csv")  # just to check

# *** sentiment analysis using VADER ***
sid_obj = SentimentIntensityAnalyzer()
for ind in data.index:
    snt = sid_obj.polarity_scores(data['tweet'][ind])
    pos.append(snt['pos'])
    neg.append(snt['neg'])
    neu.append(snt['neu'])
    compound.append(snt['compound'])
    # the dataframe loads the date as a string, so convert it; that makes it easier to group by
    date = data['date'][ind]
    day = date[0:2]
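# For reference, a standalone sketch of what polarity_scores() returns for one piece of text;
# this is the dict the loop above reads 'pos', 'neg', 'neu' and 'compound' from.
# The sample sentence is made up.
demo_scores = SentimentIntensityAnalyzer().polarity_scores("the launch went really well")
print(demo_scores)  # dict with 'neg', 'neu', 'pos' and 'compound' keys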
import time

import numpy

'''import nltk
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()
import numpy
import tflearn
import tensorflow
import random
import json
import pickle
import os'''

# imports for the objects used below; the question-answering pipeline is assumed to come
# from Hugging Face transformers
from flask import Flask
from flask_bootstrap import Bootstrap
from flask_login import LoginManager
from flask_socketio import SocketIO
from flask_sqlalchemy import SQLAlchemy
from nlppreprocess import NLP
from transformers import pipeline

# socketio
obj = NLP()
nlp = pipeline('question-answering')

app = Flask(__name__)
# run_with_ngrok(app)
app.config['SECRET_KEY'] = 'secretkey123'
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///database/db.sqlite3'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False

bootstrap = Bootstrap(app)
db = SQLAlchemy(app)

login_manager = LoginManager()
login_manager.init_app(app)
login_manager.login_view = 'login'

# socketio
socketio = SocketIO(app)

welcome_greetings = [
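# A hedged sketch of how the question-answering pipeline built above can be queried, assuming
# `nlp` is the Hugging Face transformers pipeline created earlier; the question and context
# strings are made up. Shown as comments because the snippet above is truncated mid-statement:
#   result = nlp(question="Where is the database stored?",
#                context="The chatbot keeps its data in database/db.sqlite3 next to the app.")
#   result['answer'], result['score']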
from nltk.tokenize import RegexpTokenizer
from nlppreprocess import NLP
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import gc

gc.disable()

tokenizer = RegexpTokenizer(r"[a-zA-Z]+")
stopwords = NLP()  # for stopwords removal
sb = SnowballStemmer('english')
lm = WordNetLemmatizer()


def get_preocessed_data(data, key='stemming'):
    """This function is a pipeline that extracts the relevant and useful data for the
    operations that follow. The data is processed in three steps:
    1. Tokenization
    2. Stopwords removal
    3. Stemming or lemmatization (as per your requirement)"""
    processed_data = []
    for text in data:
        text = text.lower()
        text = ' '.join(i for i in tokenizer.tokenize(text))
        text = (stopwords.process(text)).split()
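# A brief illustration of the "stemming or lemmatization" choice the docstring mentions, using
# the sb and lm objects defined above; the words are arbitrary examples and the lemmatizer
# requires the NLTK wordnet corpus to be downloaded.
print(sb.stem('running'), sb.stem('studies'))  # snowball stemmer: 'run', 'studi'
print(lm.lemmatize('running', 'v'), lm.lemmatize('studies'))  # lemmatizer: 'run', 'study'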