Example no. 1
def preprocessing(tagged_by_Sentence):
    """
    1. 특수문자 제거, 소문자
    2. not, "n't" -> not_stemming(다음단어)
    3. 특수문자 제거, 숫자 제거
    4. stopword 제거
    5. stemming
    """
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    import re
    stopwords = stopwords.words('english')
    stopwords.remove("not")
    stopwords.remove('very')
    stopwords.append("'m")
    stopwords.append("'s")

    re_special = re.compile('[^A-Za-z0-9]+')  # anything other than letters and digits
    re_num = re.compile('[0-9]+')  # digits
    st = PorterStemmer()
    new_sent = []
    not_indice = []

    for sent in tagged_by_Sentence:
        text = [(tup[0].lower(), tup[1]) for tup in sent
                if not bool(re_special.match(tup[0]))]  # 1. remove special characters, lowercase

        # 2. Merge "not" / "n't" with the following word
        # When "not" or "n't" appears, join it with the next word as "not_<stem>", store the
        # index of that next word, and remove it later in del_element_by_indice
        new_text = []
        for index, tup in enumerate(text):
            if tup[0] == "n't" or tup[0] == "not":
                if index + 1 < len(text):
                    if not bool(re_special.match(
                            text[index + 1][0])) or text[index + 1][1] != 'CD':
                        new_text.append("not_" + st.stem(text[index + 1][0]))
                        not_indice.append(index)
                else:
                    new_text.append("not")
            else:
                if not bool(re_num.match(
                        tup[0])) or tup[1] != 'CD':  # 3. remove special characters and digits
                    new_text.append(tup[0])
        new_text = del_element_by_indice(new_text, not_indice)

        new_words = [
            st.stem(word) for word in new_text if word not in stopwords
        ]  # 4, 5: remove stopwords, apply stemming
        new_sent.append(new_words)
    return new_sent
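The function above expects tagged_by_Sentence to be a list of POS-tagged sentences (one list of (token, tag) tuples per sentence) and relies on an external helper, del_element_by_indice, that is not shown here. A minimal sketch of how that input is typically built with NLTK; the sentences below are illustrative:

import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

sentences = ["I don't like this movie.", "It was not very good."]
# One list of (token, POS tag) tuples per sentence -- the shape preprocessing() expects.
tagged_by_sentence = [nltk.pos_tag(nltk.word_tokenize(s)) for s in sentences]
print(tagged_by_sentence[0][:4])  # e.g. [('I', 'PRP'), ('do', 'VBP'), ("n't", 'RB'), ('like', 'VB')]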
Example no. 2
 def __filtering_sastrawi(self, documents):
     stop_factory = StopWordRemoverFactory().get_stop_words()
     list_stop = stop_factory + self.stop_more
     dictionary = ArrayDictionary(list_stop)
     stopwords = StopWordRemover(dictionary)
     stop = stopwords.remove(documents)
     return stop
Example no. 3
 def removeStopWords(sentences, stopwords=None):
     '''
     :param sentences: list of sentences
     :param stopwords: list of stopwords
     :return:list of sentences without stopwords
     '''
     if stopwords == None:
         from nltk.corpus import stopwords
         stopwords = stopwords.words('english')
         stopwords.remove('most')
     sentences1 = []
     for sent in sentences:
         newsent = ''
         for word in word_tokenize(sent):
             if word not in stopwords:
                 newsent = newsent + ' ' + word
         sentences1.append(newsent)
     return sentences1
Example no. 5
 def stopwords_e_pontuacao(self, instancia):
     ## tokenize with nltk
     instancia = instancia.split()
     ### remove punctuation from each word
     table = str.maketrans('', '', string.punctuation)
     instancia = [w.translate(table) for w in instancia]
     ### convert to lower case and  remove everything that is not alphabetic
     instancia = [word for word in instancia if word.isalpha()]
     ## filter out StopWords
     stopwords = nltk.corpus.stopwords.words('portuguese') + [
         'aqui', 'a', 'rs', 'é', '/', 'fdp', '%', 'pfvr', 'cadê', 'né', 'q',
         'pq', '#', '@', 'mt', 'youtube', 'hj', 'dnv', 'mto', 'vc', 'eh',
         'r$', 'rt', 'via', 'vía'
     ]
     stopwords.remove("não")
     instancia = [w for w in instancia if not w in stopwords]
     ## detokenize (needed to pass the result as an argument when building a TextBlob object)
     with MosesDetokenizer('pt') as detokenize:
         instancia = detokenize(instancia)
     return instancia
Example no. 6
    def update_stopwords(self,
                         add_words=[],
                         remove_words=[],
                         update_corpus=True):
        stopwords = self.stopwords
        [stopwords.append(x) for x in add_words]

        [stopwords.remove(x) for x in remove_words if x in stopwords]

        self._stopwords_ = stopwords
        if update_corpus:
            self.prepare_corpus()
Example no. 7
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

nltk.download('punkt')
nltk.download('stopwords')

punctuation = list(string.punctuation)
stopwords = stopwords.words('english')
http_link = 'https://'

# remove negation words from stopwords
neg_words = ['no', 'nor', 'not', 'wasn', 'weren']
for word in neg_words:
    stopwords.remove(word)


def process(tweet):
    tweet = reduce_lengthening(tweet)
    token_list = tokenize(tweet)
    processed_token_list = process_token(token_list)
    stem_token_list = stemming(processed_token_list)
    return stem_token_list


def tokenize(tweet):
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True)
    token_list = tokenizer.tokenize(tweet)
    return token_list
Example no. 8
import re

from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, LSTM, Bidirectional

from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

stopwords = stopwords.words('english')
newStopWords = ['', ' ', '  ', '   ', '    ', ' s']
stopwords.extend(newStopWords)
stopwords.remove('no')
stopwords.remove('not')
stopwords.remove('very')
stop_words = set(stopwords)


def clean_doc(doc, vocab=None):
    tokens = word_tokenize(doc)
    tokens = [re.sub('[^a-zA-Z]', ' ', word) for word in tokens]
    tokens = [word.lower() for word in tokens]
    tokens = [w for w in tokens if not w in stop_words]
    tokens = [word for word in tokens if len(word) > 1]
    if vocab:
        tokens = [w for w in tokens if w in vocab]
        tokens = ' '.join(tokens)
    return tokens
Example no. 9
posneg_feature_vectors_test=[]
posneg_feature_vectors_train = []
posneg_feature_vectors_test = []
full_data=[]
label_vector = []
pos = dict()
neg = dict()
posneg = dict()
part_of_speech=[]
#global_index = 0
set_size= 8000
top_k_features = 200
end_index=0
count=0
stopwords = nltk.corpus.stopwords.words('english')
stopwords.remove('not')
train_size=0.9*set_size


def _read_data(file_name):
    """

    :rtype : object
    """

    row_cnt = -1
    with open(file_name, 'rb') as tsvin:
        tsvin = csv.reader(tsvin, delimiter='\t')

        index = 0
        for row in tsvin:
Example no. 10
#Saving the np array into a text file
np.savetxt('train_p.txt', p, delimiter=' ', fmt='%s',encoding="utf-8")
np.savetxt('train_n.txt', n, delimiter=' ', fmt='%s',encoding="utf-8")


# reading the text files and removing the Stop Words:
d = path.dirname('.')

textp_w = open(path.join(d, 'train_p.txt'),encoding='utf-8').read()
textn_w = open(path.join(d, 'train_n.txt'),encoding='utf-8').read()
stopwords = set(STOPWORDS)
stopwords.add("said")
stopwords.add("br")
stopwords.add(" ")
stopwords.remove("not")

stopwords.remove("no")
#stopwords.remove("good")
#stopwords.remove("love")
stopwords.remove("like")
#stopwords.remove("best")
#stopwords.remove("!")
print ("Total number of words in duplicate pair questions :",len(textp_w))
print ("Total number of words in non duplicate pair questions :",len(textn_w))


wc = WordCloud(background_color="white", max_words=len(textp_w), stopwords=stopwords)
wc.generate(textp_w)
print ("Word Cloud for Duplicate Question pairs")
plt.imshow(wc, interpolation='bilinear')
Example no. 11
import os
import string

import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

from nltk.corpus import stopwords
from model.utils import embedding_metric, Tokenizer, detokenize
from torchMoji.api.botmoji import Botmoji
from inferSent.api.botsent import Botsent
from Toxicity.toxic import NBLogisticRegression, NBTfidfVectorizer, tokenize

EPSILON = np.finfo(np.float32).eps
ROOT_DIR = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

tokenizer = Tokenizer('spacy')
stopwords = stopwords.words('english')
question_words = {'who', 'what', 'why', 'where', 'how', 'when'}
_ = [stopwords.remove(q) for q in question_words]
punct = list(string.punctuation)
contractions = ["'s", "'d", "'ld", "n't", "'re", "'ll", "'ve"]
filters = set(stopwords + contractions + punct)


def _get_emojis():
    # All emojis in the order returned by deepmoji
    EMOJIS = ":joy: :unamused: :weary: :sob: :heart_eyes: :pensive: " + \
             ":ok_hand: :blush: :heart: :smirk: :grin: :notes: :flushed: " + \
             ":100: :sleeping: :relieved: :relaxed: :raised_hands: " + \
             ":two_hearts: :expressionless: :sweat_smile: :pray: " + \
             ":confused: :kissing_heart: :heartbeat: :neutral_face: " + \
             ":information_desk_person: :disappointed: :see_no_evil: " + \
             ":tired_face: :v: :sunglasses: :rage: :thumbsup: :cry: " + \
             ":sleepy: :yum: :triumph: :hand: :mask: :clap: :eyes: :gun: " + \
Example no. 12
    val["ld"] = str(link_data)
    val["wn"] = str(workflow_name)
    val["un"] = str(user_data)
    val["stu"] = start_time_user
    resp = jsonify(val)
    resp.headers['Access-Control-Allow-Origin'] = '*'
    return resp


import string

from nltk.corpus import stopwords

stopwords = list(stopwords.words('english'))
stopwords.extend(list(string.punctuation))
stopwords.append("i\'ve")
stopwords.append("i\'m")
stopwords.remove("no")
stopwords.remove("not")
stopwords.remove("than")
stopwords.remove("which")
stopwords.remove("or")


def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join(
        [word for word in str(text).split() if word not in stopwords])


@app.route('/uploader', methods=['GET', 'POST'])
def upload_file():
        if request.method == 'POST':
Example no. 13
list_7up_cocacola=['7 Up','7 UP','7 up','7 uP','7-Up','7-UP','7.up','7-up','7-uP','7up','7UP','7Up','7uP','coca cola','COCA COLA','coca-cola','COCACOLA']

cola_list=['cola','coca','coke']

softdrink_list=['pepsi','mirinda','cocacola','thumbs up','coca-cola','7up','mirinda','sprite','fanta','limca','twister']

prep=['with','in','over','by','above','at','from','on','about']

list_bbq=['b b q','b l t']

quantity=['lb','oz','ozs','lbs','plate','seasonal','little','per','v.','big','small','large','medium','can','cans','per','l','L','glass','ly','ml','litre','gram','grams','gm','gms','kg','kgs','cl','pcs','pieces','piece','bottle','bottles','large','medium','med','small','inch','inches','g']

phrase=['small bottle of','small bottles of','large bottle of','large bottles of','per glass','per bottle','big bowl of','bowl of','bottle of','bottles of']

stopwords = list(set(stopwords.words('english')))
stopwords.remove('and')
stopwords.extend(['light','extra','addon','add-on','extras','spare','day','spare','double','alacarte','regular','fresh','homemade','bowl','plate','little','hs','HS','add','cup','however','often','widest','special','children','review','reviews','authentic'])
# print stopwords
remove_phrase=['freshly brewed','ala carte','add on']
menu_phrase_remove=[]

def punct(menu,punctuations):
	for char in menu:
		if char in punctuations:
			menu=menu.replace(char,' ')
	menu=' '.join(menu.split())	
	return menu


outputfile = open("output.csv","wb")
writer = csv.writer(outputfile)
Example no. 14
import re
import emoji
import nltk
#--------------------------------------------------------------------------------#
from flask import Flask, request, render_template
from flask_restful import Api, Resource
from textblob import TextBlob
#--------------------------------------------------------------------------------#
from nltk.corpus import sentiwordnet as swn
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
#--------------------------------------------------------------------------------#
from nltk.corpus import stopwords

stopwords = stopwords.words('english')
stopwords.remove("not")
stopwords.remove("no")
stopwords.remove("nor")
stopwords.remove("above")
#--------------------------------------------------------------------------------#
lemma = WordNetLemmatizer()

# Do this first: it forces evaluation of the LazyCorpusLoader,
# "materializing" the lazily loaded sentiwordnet corpus before use
next(swn.all_senti_synsets())
#--------------------------------------------------------------------------------#

pattern = '@\S+|https?:\S+|http?:\S|[^A-Za-z]+|com|net'
urlPattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
userPattern = '@[^\s]+'
alphaPattern = "[^a-zA-Z0-9]"
Example no. 15
from string import punctuation

from nltk.corpus import stopwords

from lesk import simple_lesk, original_lesk
from similarity import max_similarity
from utils import lemmatize, lemmatize_sentence
"""
This is a module for all-words full text WSD
(modified for use in tropical_models framework)

This would involve:
Step 1: First tokenize your text such that each token is separated by whitespace
Step 2: Iterate through the tokens and disambiguate only the content words.
"""

stopwords = stopwords.words('english') + list(punctuation)
stopwords.remove('is')
stopwords.remove('are')
stopwords.remove('was')
stopwords.remove('had')
stopwords.remove('being')
stopwords.remove('were')
stopwords.remove('been')
stopwords.remove('has')
stopwords.remove('be')


def disambiguate(sentence,
                 algorithm=simple_lesk,
                 context_is_lemmatized=False,
                 similarity_option='path',
                 keepLemmas=False,
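A minimal sketch of the two steps described in the module docstring above (whitespace tokenization, then disambiguating only the content words), using NLTK's built-in lesk() as a stand-in for the local simple_lesk so the example is self-contained; the sentence is illustrative:

import nltk
from nltk.corpus import stopwords
from nltk.wsd import lesk  # stand-in for the local simple_lesk

nltk.download('stopwords')
nltk.download('wordnet')

stop = set(stopwords.words('english'))

sentence = "I went to the bank to deposit my money"
tokens = sentence.split()  # Step 1: whitespace tokenization

# Step 2: disambiguate only the content words (skip stopwords)
senses = {tok: lesk(tokens, tok) for tok in tokens if tok.lower() not in stop}
for word, synset in senses.items():
    print(word, '->', synset)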
Example no. 16
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

df = pd.read_csv("train.csv")
nltk.download('stopwords')

i = nltk.corpus.stopwords.words('english')

stopwords = set(i)

stopwords.remove("not")
stopwords.remove("against")
stopwords.remove("no")


def preprocess(x):

    if (type(x) == str):
        x = re.sub('[^a-z\s]', '', x.lower())
        x = re.sub(r'[^\w\s]', "", x)
        x = [w for w in x.split() if w not in set(stopwords)]
        return x


g = []
for i in range(0, len(df)):
    y = preprocess(df['text'][i])
Example no. 17
from nltk.corpus import stopwords
from src.helpers.debug import top_keys
import re

stopwords = set(stopwords.words('english'))
stopwords.remove('don')
stopwords.remove('will')


# filter out token
def valid_tkn(tkn, valid_kw, invalid_kw):
    tkn = tkn.lower()
    if tkn in valid_kw:
        return True

    if tkn in invalid_kw:
        return False
    # stopwords
    if tkn in stopwords:
        return False

    # ampersand and twitter link
    twitter_stop = ['&amp;', 'rt', 'http']
    if '//t.co/' in tkn or tkn in twitter_stop:
        return False

    # special unicode character
    if any(ord(c) > 128 for c in tkn):
        return False

    regex = re.compile('[^a-zA-Z]')
# import tensorflow_datasets as tfds
import nltk
import pandas as pd
from nltk.corpus import stopwords

nltk.download('stopwords')

stopwords = stopwords.words('english')
stopwords.remove('not')

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 100)

# df_gen_1 = pd.read_csv('forum_content.csv', names=["link", "content", "label"])
# df_gen_2 = pd.read_csv('forum_content_gen.csv', names=["link", "content", "label"])
# df_am_1 = pd.read_csv('forum_content_literature_am.csv', names=["link", "content", "label"])
# df_am_2 = pd.read_csv('forum_content_am.csv', names=["link", "content", "label"])
#
# df_gen_1['forum_type'] = "GENERAL_FORUM"
# df_gen_2['forum_type'] = "GENERAL_FORUM"
# df_am_1['forum_type'] = "AFRICAN_AMERICAN_GENERAL_FORUM"
# df_am_2['forum_type'] = "AFRICAN_AMERICAN_GENERAL_FORUM"

df_af_am_forum = pd.read_csv('content_am_new_debug.csv',
                             names=[
                                 "member_no", "content", "threadUrl", "title",
                                 "postDate", "is_am", "author"
                             ])
# df_wm_forum = pd.read_csv('content_gen.csv', names=["member_no","content","threadUrl","title","postDate","is_am","author"])
def preprocess_text(df, columnname):
    import warnings
    import nltk
    from nltk import FreqDist
    nltk.download('punkt')
    import pandas as pd
    nltk.download('stopwords')
    warnings.filterwarnings("ignore")
    import re
    df = df.drop_duplicates(subset=columnname)
    print(df.shape)
    df[columnname] = df[columnname].map(
        lambda x: re.sub(r'http\S+', '', str(x)))
    df[columnname] = df[columnname].map(
        lambda x: re.sub(r'[^ a-zA-Z0-9!?:,.\'=]', '', str(x)))
    df[columnname] = df[columnname].str.lower()
    # Expand contractions
    import re
    contractions_dict = {
        'didn\'t': 'did not',
        'don\'t': 'do not',
        "aren't": "are not",
        "can't": "cannot",
        "cant": "cannot",
        "can't've": "cannot have",
        "'cause": "because",
        "could've": "could have",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "didn't": "did not",
        "didnt": "did not",
        "doesn't": "does not",
        "doesnt": "does not",
        "don't": "do not",
        "dont": "do not",
        "hadn't": "had not",
        "hadn't've": "had not have",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he had",
        "he'd've": "he would have",
        "he'll": "he will",
        "he's": "he is",
        "how'd": "how did",
        "how'd'y": "how do you",
        "how'll": "how will",
        "how's": "how is",
        "i'd": "i had",
        "i'd've": "i would have",
        "i'll": "i will",
        "i'm": "i am",
        "im": "i am",
        "i've": "i have",
        "isn't": "is not",
        "it'll": "it will",
        "it's": "it is",
        "let's": "let us",
        "ma'am": "madam",
        "mayn't": "may not",
        "might've": "might have",
        "mightn't": "might not",
        "must've": "must have",
        "mustn't": "must not",
        "mustn't've": "must not have",
        "needn't": "need not",
        "needn't've": "need not have",
        "oughtn't": "ought not",
        "oughtn't've": "ought not have",
        "shan't": "shall not",
        "sha'n't": "shall not",
        "shan't've": "shall not have",
        "she'd": "she had",
        "she'd've": "she would have",
        "she'll": "she will",
        "she's": "she is",
        "should've": "should have",
        "shouldn't": "should not",
        "shouldn't've": "should not have",
        "that's": "that is",
        "there's": "there is",
        "they'd": "they had",
        "they'd've": "they would have",
        "they'll": "they will",
        "they're": "they are",
        "they've": "they have",
        "to've": "to have",
        "wasn't": "was not",
        "we'd": "we had",
        "we'd've": "we would have",
        "we'll": "we will",
        "we'll've": "we will have",
        "we're": "we are",
        "we've": "we have",
        "weren't": "were not",
        "what're": "what are",
        "what's": "what is",
        "what've": "what have",
        "when've": "when have",
        "where'd": "where did",
        "where's": "where is",
        "where've": "where have",
        "who'll": "who will",
        "who's": "who is",
        "will've": "will have",
        "won't": "will not",
        "won't've": "will not have",
        "would've": "would have",
        "wouldn't": "would not",
        "wouldn't've": "would not have",
        "y'all": "you all",
        "you'll": "you will",
        "you're": "you are",
        "you've": "you have"
    }

    contractions_re = re.compile('(%s)' % '|'.join(contractions_dict.keys()))

    def expand_contractions(s, contractions_dict=contractions_dict):
        def replace(match):
            return contractions_dict[match.group(0)]

        return contractions_re.sub(replace, s)

    df[columnname] = df[columnname].apply(expand_contractions)
    df = df.reset_index(drop=False)

    data = df[[columnname, 'index']]
    print(data.columns)
    data.rename(columns={'index': 'INDEX'}, inplace=True)
    from nltk.tokenize import sent_tokenize
    data['split'] = data[columnname].apply(sent_tokenize)
    data_split = data.set_index('INDEX').split.apply(
        pd.Series).stack().reset_index(level=0).rename(columns={0: columnname})
    data_split.reset_index(level=0, inplace=True)
    data_split.rename(columns={
        'INDEX': 'review_no',
        'index': 'sentence'
    },
                      inplace=True)
    # Spell Correct Algorithm
    from symspellpy.symspellpy import SymSpell  # import the module
    max_edit_distance_dictionary = 0
    prefix_length = 7
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = "./frequency_dictionary_en_82_765.txt"
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    #if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
    sym_spell.load_dictionary(dictionary_path, term_index, count_index)
    data_split[columnname] = data_split[columnname].apply(
        sym_spell.word_segmentation)
    data_split[columnname] = data_split[columnname].apply(
        lambda x: x.corrected_string)
    from Sentiment_prediction_noor import Process_and_predict
    data_split = Process_and_predict(data_split, columnname)

    df.rename(columns={'index': 'review_no'}, inplace=True)

    #print(df.head())
    df.rename(columns={columnname: "Text"}, inplace=True)
    data_final = data_split.merge(df, on="review_no", how='left')
    print(data_final.columns)
    #print(data_final.head(2))
    # Natural Language Processing of Reviews (Top Features, Feelings, Actions)
    import spacy
    from spacy import displacy
    from collections import Counter
    import en_core_web_sm
    nlp = en_core_web_sm.load()
    from nltk.corpus import stopwords
    stopwords = stopwords.words('english')
    newStopWords = ['-PRON-']
    stopwords.extend(newStopWords)

    words2 = ['no', 'nor', 'not']
    for word in list(
            stopwords
    ):  # iterating on a copy since removing will mess things up
        if word in words2:
            stopwords.remove(word)
    # remove short words (length =< 0)
    final = data_final[columnname].apply(
        lambda x: ' '.join([w for w in x.split() if len(w) > 0]))
    final = pd.DataFrame(final)
    #tokenized_reviews = pd.Series(reviews).apply(lambda x: x.split())
    tokenized_reviews = pd.Series(final[columnname]).apply(lambda x: x.split())

    #def lemmatization(texts, tags=['NOUN', 'ADJ']):
    #def lemmatization(texts, tags=['NOUN', 'ADJ', 'VERB']):
    def lemmatization(texts, tags=['NOUN']):
        output = []
        for sent in texts:
            doc = nlp(" ".join(sent))
            output.append(
                [token.lemma_ for token in doc if token.pos_ in tags])
        return output

    def lemmatization_adj(texts, tags=['ADJ']):
        output = []
        for sent in texts:
            doc = nlp(" ".join(sent))
            output.append(
                [token.lemma_ for token in doc if token.pos_ in tags])
        return output

    def lemmatization_verb(texts, tags=['VERB']):
        output = []
        for sent in texts:
            doc = nlp(" ".join(sent))
            output.append(
                [token.lemma_ for token in doc if token.pos_ in tags])
        return output

    noun_adj_pairs = []
    noun_adj_sent = []
    import spacy
    for l in range(len(final)):
        doc = nlp(str(final[columnname][l]))
        noun_adj_pairs = []
        for i, token in enumerate(doc):
            #print(token)
            if token.pos_ not in ('NOUN', 'PROPN'):
                continue
            for j in range(i + 1, len(doc)):
                if doc[j].pos_ == 'ADJ':
                    noun_adj_pairs.append((token, doc[j]))
                    break
        noun_adj_sent.append(noun_adj_pairs)
    final['noun_adj'] = noun_adj_sent

    #Noun Extraction

    print("Beginning Noun Extraction")
    reviews_noun = lemmatization(tokenized_reviews)

    reviews_3 = []
    for i in range(len(reviews_noun)):
        reviews_3.append(' '.join(reviews_noun[i]))
    final['features'] = reviews_3

    print("Noun Extraction Complete")

    #Adjective Extraction

    print("Beginning Adjectives Extraction")
    reviews_adj = lemmatization_adj(tokenized_reviews)

    reviews_4 = []
    for i in range(len(reviews_adj)):
        reviews_4.append(' '.join(reviews_adj[i]))
    final['feelings'] = reviews_4

    print("Adjectives Extraction Complete")

    #Verb Extraction

    print("Beginning Verb Extraction")
    reviews_verb = lemmatization_verb(tokenized_reviews)

    reviews_5 = []
    for i in range(len(reviews_verb)):
        reviews_5.append(' '.join(reviews_verb[i]))
    final['action'] = reviews_5

    print("Verb Extraction Complete")

    # remove short words (length =< 3)
    final['features'] = final['features'].apply(
        lambda x: ' '.join([w for w in x.split() if len(w) > 2]))
    final['feelings'] = final['feelings'].apply(
        lambda x: ' '.join([w for w in x.split() if len(w) > 2]))
    final['action'] = final['action'].apply(
        lambda x: ' '.join([w for w in x.split() if len(w) > 2]))

    #final['adverb']   = final['adverb'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))

    # Remove Stopwords and Spaces (Start and End)
    def remove_stopwords(text):
        text = ' '.join(
            [word for word in text.split() if word not in stopwords])
        return text.strip()

    final['features'] = final['features'].apply(remove_stopwords)
    final['feelings'] = final['feelings'].apply(remove_stopwords)
    final['action'] = final['action'].apply(remove_stopwords)

    final = final.drop([columnname], axis=1)

    def freq_words(x):
        all_words = ' '.join([text for text in x])
        all_words = all_words.split()
        #print(all_words)
        fdist = FreqDist(all_words)
        words_df = pd.DataFrame({
            'word': list(fdist.keys()),
            'count': list(fdist.values())
        })
        # selecting top 20 most frequent words
        #d = words_df.nlargest(columns="count", n = length)

        # take words that covers up to 80th quantile
        length = int(words_df['count'].quantile(0.85))
        #print(length)
        # sort results - descending
        d = words_df[words_df['count'] >= length].sort_values(['count'],
                                                              ascending=False)
        d = d['word'].tolist()
        return d

    feature_filt_dict = freq_words(final['features'])
    action_filt_dict = freq_words(final['action'])
    feeling_filt_dict = freq_words(final['feelings'])

    def remove_min_words(text, dictionary):
        text = ' '.join([word for word in text.split() if word in dictionary])
        return text

    final['feelings'] = final['feelings'].apply(remove_min_words,
                                                dictionary=feeling_filt_dict)
    final['action'] = final['action'].apply(remove_min_words,
                                            dictionary=action_filt_dict)
    final['features'] = final['features'].apply(remove_min_words,
                                                dictionary=feature_filt_dict)
    #final['adverb'] = final['adverb'].apply(remove_min_words, dictionary = feature_filt_dict)

    # Remove duplicate words in cell
    from collections import OrderedDict

    final['features'] = final['features'].str.split().apply(
        lambda x: OrderedDict.fromkeys(x).keys()).str.join(' ')
    final['feelings'] = final['feelings'].str.split().apply(
        lambda x: OrderedDict.fromkeys(x).keys()).str.join(' ')
    final['action'] = final['action'].str.split().apply(
        lambda x: OrderedDict.fromkeys(x).keys()).str.join(' ')
    #final['adverb']   = final['adverb'].str.split().apply(lambda x: OrderedDict.fromkeys(x).keys()).str.join('')
    print(final.columns)
    rows = []
    _ = final.apply(lambda row: [
        rows.append([row['features'], row['feelings'], row['action'], na])
        for na in row.noun_adj
    ],
                    axis=1)
    final = pd.DataFrame(
        rows, columns=['features', 'feelings', 'action', 'noun_adj'])

    #pd.concat([df_new.noun_adj.str.extract('(?P<col1>\d+),(?P<col2>\d+)'),df_new], axis = 1)
    final = pd.concat([
        final.noun_adj.apply(
            lambda x: pd.Series(x, index=['Feature_n', 'Feeling_adj'])), final
    ],
                      axis=1)

    final_data = pd.concat([data_final, final], axis=1)

    final_data.reset_index(inplace=True)

    print(final_data.head())
    print(final_data.dtypes)

    ## Keep Reviews that are 3 characters or longer
    #final_data = final_data[final_data['Text'].apply(lambda x: len(x) > 3)]
    mask = (final_data['Text'].str.len() > 3)
    final_data = final_data.loc[mask]
    print(final_data.columns)
    final_data = final_data.drop(columns='index')

    mycolumns = final_data.columns
    data_final_nlp = final_data[mycolumns]

    features_file = data_final_nlp[[
        'sentence', 'review_no', columnname, 'Text', 'comp_sentiment',
        'features'
    ]]

    actions_file = data_final_nlp[[
        'sentence', 'review_no', columnname, 'Text', 'comp_sentiment', 'action'
    ]]

    feelings_file = data_final_nlp[[
        'sentence', 'review_no', columnname, 'Text', 'comp_sentiment',
        'feelings'
    ]]

    noun_adj_file = data_final_nlp[[
        'sentence', 'review_no', columnname, 'Text', 'comp_sentiment',
        'Feature_n', 'Feeling_adj'
    ]]
    # Explode pandas dataframe string entry to separate rows
    import pandas as pd
    import numpy as np

    def explode(df, lst_cols, fill_value='', preserve_index=False):
        # make sure `lst_cols` is list-alike
        if (lst_cols is not None and len(lst_cols) > 0
                and not isinstance(lst_cols,
                                   (list, tuple, np.ndarray, pd.Series))):
            lst_cols = [lst_cols]
        # all columns except `lst_cols`
        idx_cols = df.columns.difference(lst_cols)
        # calculate lengths of lists
        lens = df[lst_cols[0]].str.len()
        # preserve original index values
        idx = np.repeat(df.index.values, lens)
        # create "exploded" DF
        res = (pd.DataFrame(
            {col: np.repeat(df[col].values, lens)
             for col in idx_cols},
            index=idx).assign(
                **{
                    col: np.concatenate(df.loc[lens > 0, col].values)
                    for col in lst_cols
                }))
        # append those rows that have empty lists
        if (lens == 0).any():
            # at least one list in cells is empty
            res = (res.append(df.loc[lens == 0, idx_cols]).fillna(fill_value))
        # revert the original index order
        res = res.sort_index()
        # reset index if requested
        if not preserve_index:
            res = res.reset_index(drop=True)
        return res

    #Creation of Features, Feelings and Actions File
    features_file = features_file.assign(
        features=features_file.features.str.split(' '))
    actions_file = actions_file.assign(
        action=actions_file.action.str.split(' '))
    feelings_file = feelings_file.assign(
        feelings=feelings_file.feelings.str.split(' '))
    #adverb_file = adverb_file.assign(adverb=adverb_file.adverb.str.split(' '))

    features_file = explode(features_file, ['features'], fill_value='')
    actions_file = explode(actions_file, ['action'], fill_value='')
    feelings_file = explode(feelings_file, ['feelings'], fill_value='')
    #adverb_file = explode(adverb_file, ['adverb'], fill_value='')

    features_file.to_excel("Features_file.xlsx", index=False)
    actions_file.to_excel("Actions_file.xlsx", index=False)
    feelings_file.to_excel("Feelings_file.xlsx", index=False)
    noun_adj_file.to_excel("Noun_adj_file.xlsx", index=False)

    return features_file, actions_file, feelings_file, noun_adj_file
Example no. 20
Without empirical features, the classification performance is very poor (around 60%), since each deal is very short and there is no big difference between good and bad deals.
I tried linear SVM, decision tree, logistic regression with Lasso penalty, and naive Bayes, among which linear SVM performs the best.
- How did you test your classifier?
Answer: Randomly sample 1/6 of the training data as test data, use the remaining 5/6 for training, build the model and select parameters. Repeat the process 20 times
(a sketch of this evaluation loop is given after this example). Finally, compute the average accuracy, train the model on all the training data, and apply it to test_deals.txt.
However, the test data is unlabeled, hence I only output the prediction result.
By manual checking, lines 42, 50 and 53 mention coupon codes, and our prediction results for them are all 1!
"""
import nltk
import re
import numpy as np
from sklearn import svm
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
if ('off' in stopwords):
    stopwords.remove('off')
def normalise(word):
    """Normalises words to lowercase and stems and lemmatizes it."""
    stemmer = nltk.PorterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()
    word = word.lower()
    word = stemmer.stem(word)
    word = lemmatizer.lemmatize(word)
    return word
 
def acceptable_word(word):
    """Checks conditions for acceptable word: length, stopword."""
    word= word.lower()
    accepted = bool(2 <= len(word) <= 40 and word not in stopwords)
    return accepted
def document_features(document): 
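A minimal sketch of the evaluation procedure described in the docstring above (twenty repeated random 1/6 hold-out splits with a linear SVM, then averaging the accuracy), assuming scikit-learn and a generic feature matrix X with labels y; the names are illustrative and this is not the author's original code:

import numpy as np
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


def repeated_holdout_accuracy(X, y, n_repeats=20, test_size=1 / 6):
    """Average accuracy over repeated random train/test splits."""
    scores = []
    for seed in range(n_repeats):
        X_tr, X_te, y_tr, y_te = train_test_split(
            X, y, test_size=test_size, random_state=seed)
        clf = LinearSVC()
        clf.fit(X_tr, y_tr)
        scores.append(accuracy_score(y_te, clf.predict(X_te)))
    return np.mean(scores)


# After parameter selection, refit on all labeled data and predict on the unlabeled deals:
# final_clf = LinearSVC().fit(X, y)
# predictions = final_clf.predict(X_test_deals)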
Example no. 21
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sknn.mlp import Classifier, Layer
from common import load_data
from bc_prog import bc_prog
import csv
import re
import nltk
import math

from nltk.tag import PerceptronTagger
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet

tagger = PerceptronTagger()
lemmatizer = WordNetLemmatizer()
basictizer = bc_prog()

stopwords = stopwords.words('english')
stopwords.remove("but")
stopwords.remove("not")
stopwords.remove("no")
stopwords.remove("very")

english_vocab = set(w.lower() for w in nltk.corpus.words.words())
abbrev_dict = ["'m", "n't", "'s", "'re", "'ve"]

def wordnet_pos_code(tag):
  if tag == None:
    return ''
  elif tag.startswith('NN'):
    return wordnet.NOUN
  elif tag.startswith('VB'):
    return wordnet.VERB
  elif tag.startswith('JJ'):
Example no. 22
#plt.plot([0,1],[0,1],'r--')
#plt.xlim([-0.1,1.2])
#plt.ylim([-0.1,1.2])
plt.xlabel('False Positive Rate -->')
plt.ylabel('True Positive Rate -->')
plt.show()
######## Applying DeepLearning LSTM
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf.fit(result['Reviews'])
###########################################################
from wordcloud import STOPWORDS
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
stopwords = set(STOPWORDS)
stopwords.remove("not")
count_vect = CountVectorizer(min_df=2,
                             stop_words=stopwords,
                             ngram_range=(1, 2))
tfidf_transformer = TfidfTransformer()
df_cv = count_vect.fit_transform(result["Reviews"])
df_tf = tfidf_transformer.fit_transform(df_cv)
#################################################################
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
import re
max_fatures = 30000