Example no. 1
from nlppreprocess import NLP  # stop-word removal used throughout these examples


def preprocessQuery(df):
    # Remove stop words from the query column, lowercase it, then stem the
    # flattened text of the whole frame
    nlp = NLP()
    df['query'] = df['query'].apply(nlp.process)
    df['query'] = df['query'].str.lower()
    # df['query'] = df.query.apply(stemming)
    text = df.to_string(index=False, header=False)
    tokens = stemming(text)
    return tokens
def preprocess(df):
    # Strip the artist and track names out of each body before normalising the text.
    # Rows yielded by iterrows() are copies, so changes are written back with df.at.
    for index, row in df.iterrows():
        if row['artist-name'] is not None:
            df.at[index, 'body'] = row['body'].replace(row['artist-name'], '')
        df.at[index, 'body'] = df.at[index, 'body'].replace(row['track-name'], '')
    nlp = NLP()
    df['body'] = df['body'].apply(nlp.process)
    df['body'] = df['body'].str.lower()
    df['body'] = df.body.apply(stemming)
    df['lyrics'] = df['lyrics'].apply(nlp.process)
    df['lyrics'] = df['lyrics'].str.lower()
    df['lyrics'] = df.lyrics.apply(stemming)
    return df
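Both functions above call a `stemming` helper defined elsewhere in the project. A minimal sketch of what such a helper could look like, using NLTK's PorterStemmer as a hypothetical stand-in for the original implementation:

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def stemming(text):
    # Stem each whitespace-separated token and rejoin them into one string
    # (the original helper may instead return the token list itself)
    return ' '.join(stemmer.stem(word) for word in text.split())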
Example no. 3
def one_string_stop_words(sentence):
    '''
    Argument:
        a string of words
    Returns:
        the string with stop words such as 'this' and 'did' removed,
        while negation words such as 'not' and 'no' are kept
    '''
    stop_words = NLP().stopword_list
    sentence = sentence.split(' ')
    updated_sentence = ''
    for word in sentence:
        if word not in stop_words:
            updated_sentence += word + ' '
    return updated_sentence
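A quick usage sketch (hypothetical input; the exact output depends on the stop-word list shipped with nlppreprocess):

print(one_string_stop_words('this product did not work'))
# e.g. 'product not work ' -- fillers such as 'this' and 'did' are dropped while
# the negation 'not' is kept; note the trailing space left by the concatenation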
Example no. 4
import re

from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nlppreprocess import NLP


def cleaner(line):

    # Remove RT prefixes, t.co URLs and trailing white space
    line = re.sub(r'^RT ', '', re.sub(r'https://t\.co/\w+', '', line).strip())

    # Remove punctuation
    punctuation = re.compile(r"[.;:!\'’‘“”?,\"()\[\]]")
    tweet = punctuation.sub("", line.lower())

    # Remove stopwords
    nlp_for_stopwords = NLP(replace_words=True,
                            remove_stopwords=True,
                            remove_numbers=True,
                            remove_punctuations=False)
    # nlppreprocess drops unnecessary stop words while keeping words like [is, not, was]
    # https://towardsdatascience.com/why-you-should-avoid-removing-stopwords-aa7a353d2a52
    tweet = nlp_for_stopwords.process(tweet)

    # Tokenisation
    # We use split() instead of nltk's word_tokenize because the tweet is already
    # clean at this point and the Twitter data is not complicated
    tweet = tweet.split()

    # POS
    pos = pos_tag(tweet)

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tweet = ' '.join([
        lemmatizer.lemmatize(word, po[0].lower())
        if po[0].lower() in ['n', 'r', 'v', 'a'] else word for word, po in pos
    ])
    # tweet = ' '.join([lemmatizer.lemmatize(word, 'v') for word in tweet])

    return tweet
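One caveat in the lemmatization step above: `pos_tag` returns Penn Treebank tags, so `po[0].lower()` is 'n'/'v'/'r' for nouns, verbs and adverbs, but 'j' for adjectives, which never matches 'a' and leaves adjectives unlemmatized. A common fix (a hedged sketch, not part of the original code) is to map Treebank tags to WordNet constants explicitly:

from nltk.corpus import wordnet

def treebank_to_wordnet(tag):
    # Map a Penn Treebank tag to the matching WordNet POS constant,
    # defaulting to NOUN when there is no direct equivalent
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

# inside cleaner() this would become:
# tweet = ' '.join(lemmatizer.lemmatize(word, treebank_to_wordnet(po)) for word, po in pos)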
def one_string_stop_words(sentence, language):
    '''
    Argument:
        a string of words
    Returns:
        the string with stop words such as 'this' and 'did' removed,
        while negation words such as 'not' and 'no' are kept
    '''
    if language == 'English' or language == 'english':
        stop_words = NLP().stopword_list  # retrieve the stopwords list
        sentence = sentence.split(' ')
        updated_sentence = ''
        for word in sentence:
            if word not in stop_words:
                updated_sentence += word + ' '
    
    elif language == 'Arabic' or language == 'arabic':

        file_dir1 = 'sentiment_behind_reviews/ml_work/stop_words/nltk_stop_words_handle.txt'
        file_dir2 = 'sentiment_behind_reviews/ml_work/stop_words/stop_list1.txt'
        file_dir3 = 'sentiment_behind_reviews/ml_work/stop_words/updated_stop_words.txt'

        # Build a deduplicated list from the second file (note: only the third
        # file's list is actually used as the stop-word set below)
        stop_words_designed = []
        stop_words_designed.extend(convert_file_of_stop_words_to_list(file_dir2))
        stop_words_designed = list(set(stop_words_designed))
        arabic_stop_words_designed = convert_file_of_stop_words_to_list(file_dir3)

        stop_words = arabic_stop_words_designed 
        sentence = sentence.split(' ')
        updated_sentence = ''
        for word in sentence:
            if word not in stop_words:
                updated_sentence += word + ' '
    else:
        # Unsupported language: leave the sentence unchanged
        updated_sentence = sentence

    return updated_sentence
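The Arabic branch depends on a `convert_file_of_stop_words_to_list` helper that is not shown here. A plausible sketch, assuming each stop-word file stores one word per line in UTF-8:

def convert_file_of_stop_words_to_list(file_path):
    # Hypothetical reconstruction: read a stop-word file (one word per line)
    # and return the non-empty, stripped lines as a list
    with open(file_path, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]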
def one_string_stop_words(sentence, language):
    '''
    Argument:
        a string of words
    Returns:
        the string with stop words such as 'this' and 'did' removed,
        while negation words such as 'not' and 'no' are kept
    '''
    if language == 'English' or language == 'english':
        stop_words = NLP().stopword_list  # retrieve the stopwords list
        sentence = sentence.split(' ')
        updated_sentence = ''
        for word in sentence:
            if word not in stop_words:
                updated_sentence += word + ' '

    elif language == 'Arabic' or language == 'arabic':
        # arabic_stop_words_designed is assumed to be defined at module level
        stop_words = arabic_stop_words_designed
        sentence = sentence.split(' ')
        updated_sentence = ''
        for word in sentence:
            if word not in stop_words:
                updated_sentence += word + ' '
    else:
        # Unsupported language: leave the sentence unchanged
        updated_sentence = sentence

    return updated_sentence
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nlppreprocess import NLP

#***reading raw data generated by twint scraper***
data = pd.read_csv(
    'Processed_data_input_to_sentiment.csv',
    encoding='latin1',
    usecols=["date", "time", "username", "name", "tweet", "likes_count"])

pos = []
neg = []
neu = []
compound = []

nlp = NLP()
data['tweet'] = data['tweet'].apply(nlp.process)
data.to_csv("Temp_StopWords_Removed.csv")  # just to check

#***sentiment analysis using vader sentiment***
sid_obj = SentimentIntensityAnalyzer()
for ind in data.index:
    snt = sid_obj.polarity_scores(data['tweet'][ind])
    pos.append(snt['pos'])
    neg.append(snt['neg'])
    neu.append(snt['neu'])
    compound.append(snt['compound'])
    date = data['date'][ind]

    # The dataframe loads the date as a string, so pull the day out of it
    # (converting to a real datetime would make grouping easier)
    day = date[0:2]
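The manual slice `date[0:2]` relies on the date strings being day-first. A sketch of the datetime-based alternative hinted at in the comment (assuming a day-first format such as dd/mm/yyyy; adjust `dayfirst` otherwise):

data['date'] = pd.to_datetime(data['date'], dayfirst=True)
data['day'] = data['date'].dt.day  # numeric day of month, convenient for groupby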
Example no. 8
import time

import numpy

# Imports required by the application setup below
from flask import Flask
from flask_bootstrap import Bootstrap
from flask_login import LoginManager
from flask_socketio import SocketIO
from flask_sqlalchemy import SQLAlchemy
from nlppreprocess import NLP
from transformers import pipeline
'''import nltk
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

import numpy
import tflearn
import tensorflow
import random
import json
import pickle
import os'''
#socketio

obj = NLP()
nlp = pipeline('question-answering')
app = Flask(__name__)
#run_with_ngrok(app)
app.config['SECRET_KEY'] = 'secretkey123'
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///database/db.sqlite3'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
bootstrap = Bootstrap(app)
db = SQLAlchemy(app)
login_manager = LoginManager()
login_manager.init_app(app)
login_manager.login_view = 'login'

#socketio
socketio = SocketIO(app)
welcome_greetings = [
Example no. 9
from nltk.tokenize import RegexpTokenizer
from nlppreprocess import NLP
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import gc

gc.disable()

tokenizer = RegexpTokenizer(r"[a-zA-Z]+")
stopwords = NLP()  # for stopword removal
sb = SnowballStemmer('english')
lm = WordNetLemmatizer()


def get_preprocessed_data(data, key='stemming'):
    """This function is a pipeline that extracts the relevant
    and useful data for our further operations.

    The data goes through 3 important steps:
    1. Tokenization
    2. Stopwords removal
    3. Stemming or lemmatization (as per your requirement)"""

    processed_data = []

    for text in data:

        text = text.lower()
        text = ' '.join(i for i in tokenizer.tokenize(text))
        text = (stopwords.process(text)).split()