Example #1
from senticnet.senticnet import SenticNet


class Dictionary:
    def __init__(self):
        self.sn = SenticNet()

    def get_word_polarity(self, word, log=True):
        """
        Input : String
        Output : "positive" or "negative"
        """
        value = "empty"
        try:
            value = self.sn.polarity_value(word.lower())
        except KeyError:
            if log:
                print('An error occurred. Word: ' + word + ' is not known.')
        return value

    def get_word_polarity_numerical_value(self, word, log=True):
        """
        Input : String
        Output : Float in [-1, 1]
        """
        value = "empty"
        try:
            value = self.sn.polarity_intense(word.lower())
        except KeyError:
            if log:
                print('An error occurred. Word: ' + word + ' is not known.')
        return value
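A minimal usage sketch of the wrapper above (assuming the queried word exists in the SenticNet lexicon; unknown words fall back to the "empty" placeholder):

d = Dictionary()
print(d.get_word_polarity("love"))                   # e.g. "positive"
print(d.get_word_polarity_numerical_value("love"))   # a polarity intensity in [-1, 1]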
Example #2
import itertools

import nltk
from nltk.probability import FreqDist
from senticnet.senticnet import SenticNet


def Terms_Chooser(data, n_of_words, polarity_threshold):
    sn = SenticNet()

    # Tokenize every review, then flatten the token lists into one word list
    data["Content"] = data["Content"].apply(lambda row: nltk.word_tokenize(row))
    lista = list(itertools.chain.from_iterable(data["Content"].values.tolist()))

    # Keep the n_of_words most frequent tokens
    FD = FreqDist(lista)
    MC = FD.most_common(n_of_words)
    common_words = [word for word, _ in MC]

    # Keep only the common words whose SenticNet polarity intensity exceeds the threshold
    polarity = []
    words = []
    for x in common_words:
        try:
            temp = sn.polarity_intense(x)
            if abs(float(temp)) > polarity_threshold:
                polarity.append(temp)
                words.append(x)
        except Exception:
            # word not in the SenticNet lexicon
            continue

    return words
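A hypothetical call of Terms_Chooser, assuming a DataFrame with a "Content" column of raw review strings and the NLTK 'punkt' tokenizer data downloaded:

import pandas as pd

df = pd.DataFrame({"Content": ["I love this beautiful place",
                               "I hate the horrible weather"]})
selected = Terms_Chooser(df, n_of_words=50, polarity_threshold=0.5)
print(selected)  # tokens whose SenticNet polarity intensity exceeds 0.5 in magnitude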
Example #3
from senticnet.senticnet import SenticNet


class SenticValuer:
    def getSentics(self, word):
        """Return [pleasantness, attention, sensitivity, aptitude, polarity intensity] for a word."""
        senticsAndIntensity = []
        sn = SenticNet('en')
        try:
            sentics = sn.sentics(word)
            polarity_intensity = sn.polarity_intense(word)

            senticsAndIntensity.append(float(sentics['pleasantness']))
            senticsAndIntensity.append(float(sentics['attention']))
            senticsAndIntensity.append(float(sentics['sensitivity']))
            senticsAndIntensity.append(float(sentics['aptitude']))
            senticsAndIntensity.append(float(polarity_intensity))

            return senticsAndIntensity

        except Exception:
            # unknown word: fall back to neutral values
            defaultsentics = [0.0, 0.0, 0.0, 0.0, 0.0]
            return defaultsentics


# ##TESTING AREA
# yas = SenticValuer()
# print(yas.getSentics("awkward"))
Example #4
from senticnet.senticnet import SenticNet

sn = SenticNet()
print("polarity value:", sn.polarity_value("love"))
print("polarity intense:", sn.polarity_intense("love"))
print("moodtags:", ", ".join(sn.moodtags("love")))
print("semantics:", ", ".join(sn.semantics("love")))
print("\n".join([key + ": " + str(value) for key, value in sn.sentics("love").items()]))
Example #5
import threading
import time

import pandas as pd
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from senticnet.senticnet import SenticNet
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC


class Sarcasm:
    def __init__(self, *args, **kwargs):
        # loading necessaries
        self.nlp = spacy.load("en_core_web_sm")
        self.senti = SenticNet()
        self.sid = SentimentIntensityAnalyzer()
        # loading dataset
        self.df = pd.read_json("./Sarcasm_Headlines_Dataset.json", lines=True)
        self.df = self.df[:15000]
        self.df.drop(columns="article_link",
                     inplace=True)  # dropping unnecessary attribute
        # storing nlp data in headlines variable
        self.headlines = []
        self.uni_gram = set()
        self.uni_feature = []
        self.y_ = []
        for i in self.df['headline']:
            self.headlines.append(self.nlp(i))

    def w_score(self, w):
        """
        Input: word
        Word score based on NLTK's VADER sentiment analysis and the
        SenticNet polarity intensity; the two are averaged when both
        are available.
        """
        ss = self.sid.polarity_scores(w)['compound']
        try:
            sn = float(self.senti.polarity_intense(w))
            if ss == 0:
                return sn
            return (sn + ss) / 2
        except KeyError:
            # word not found in SenticNet: fall back to the VADER score alone
            return ss

    def sentimentScore(self, sent):
        """
        Input: sentence
        Returns whether a sentiment contradiction occurs, together with
        the summed positive and negative word scores.
        """
        sum_pos_score = 0
        sum_neg_score = 0
        for w in sent:
            if w.lemma_ == '-PRON-':
                score = self.w_score(w.text)
            else:
                score = self.w_score(w.lemma_)
            if score > 0:
                sum_pos_score += score
            else:
                sum_neg_score += score
        if sum_pos_score > 0 and sum_neg_score < 0:
            return ("contradict", sum_pos_score, sum_neg_score)
        else:
            return ("anything", sum_pos_score, sum_neg_score)

    def coherence(self, s1, s2):
        '''
        Input sentence1, sentence2 using nlp
        Rule1:- Pronoun match feature - including reflexive, personal, and possessive pronouns.
        Rule2:- String match feature - ignore stop words
        Rule3:- Definite noun phrase - w2 starts with the word 'the'
        Rule4:- Demonstrative noun phrase feature - w2 starts with the "this", "that", "these" and "those"
        Rule5:- Both proper names features - w1 and w2 are both named entities
        '''
        # subject and object of s1 and s2
        sub1 = ""
        sub2 = ""
        obj1 = ""
        obj2 = ""

        for i in s1.noun_chunks:
            if i.root.dep_ == 'nsubj':
                sub1 = i.root
            if i.root.dep_ == 'pobj':
                obj1 = i.root
        for j in s2.noun_chunks:
            if j.root.dep_ == 'nsubj':
                # rule 1: matching pronoun subjects
                if not isinstance(sub1, str) and sub1.pos_ == 'PRON' and j.root.pos_ == 'PRON':
                    if sub1.text.lower() == j.root.text.lower():
                        return "coherent"
                # rules 3 and 4: definite / demonstrative noun phrases
                if j[0].text.lower() == 'the':
                    return "coherent"
                if j[0].text.lower() in ['this', 'that', 'these', 'those']:
                    return "coherent"
            if j.root.dep_ == 'pobj':
                # rule 1 applied to pronoun objects
                if not isinstance(obj1, str) and obj1.pos_ == 'PRON' and j.root.pos_ == 'PRON':
                    if obj1.text.lower() == j.root.text.lower():
                        return "coherent"
        return "Not coherent"

    def to_string_from_list(self, l):
        st = ""
        for i in l:
            st += i + ' '
        return st.rstrip()

    def n_gram_feature(self, text, n):
        """
        Input: headline in nlp
        Finding n grams of given text
        """
        one_list = []
        for tok in text:
            if not tok.is_punct:
                if tok.lemma_ != '-PRON-':
                    one_list.append(tok.lemma_)
                else:
                    one_list.append(tok.text)
        try:
            one_list.remove(' ')
        except:
            pass
        #convert it to n-gram
        _list = []
        for i, t in enumerate(one_list):
            if len(one_list[i:n + i]) >= n:
                _list.append(self.to_string_from_list(one_list[i:n + i]))
        return set(_list)

    def contradiction_feature(self, headline):
        '''
        Contradiction feature
        input: an nlp-processed headline
        '''
        #for single sentence headline
        if len(list(headline.sents)) == 1:
            if self.sentimentScore(headline)[0] == 'contradict':
                return (1, 0)
            else:
                return (0, 0)
        #for multisentence headline
        else:
            if self.sentimentScore(headline)[0] == 'contradict':
                sent = list(headline.sents)
                i = 0
                while i < len(sent) - 1:
                    # number of sentences
                    if self.coherence(sent[i], sent[i + 1]) != "coherent":
                        return (0, 0)
                    i += 1
                return (0, 1)

            else:
                return (0, 0)

    def baseline3(self):
        '''
        Use of sentiment analysis + coherence
        '''
        predictions = []
        for i in self.headlines:
            get = self.contradiction_feature(i)
            if get == (1, 0) or get == (0, 1):
                predictions.append(1)
            else:
                predictions.append(0)
        return (confusion_matrix(self.df['is_sarcastic'], predictions),
                classification_report(self.df['is_sarcastic'], predictions),
                accuracy_score(self.df['is_sarcastic'], predictions))

    def baseline1(self):
        predictions = []
        for p in self.headlines:
            co, _, _ = self.sentimentScore(p)
            if (co == 'contradict'):
                predictions.append(1)
            else:
                predictions.append(0)
        return (confusion_matrix(self.df['is_sarcastic'], predictions),
                classification_report(self.df['is_sarcastic'], predictions),
                accuracy_score(self.df['is_sarcastic'], predictions))

    def uni_gram_features(self, start, end, n=1):
        self.uni_gram = list(self.uni_gram)
        self.uni_gram = sorted(self.uni_gram)
        index = start
        for p in self.headlines[start:end]:
            uni = [0 for i in range(len(self.uni_gram))]
            for i, j in enumerate(p):
                temp = []  #temp
                if len(p[i:n + i]) >= n:
                    for k in range(n):

                        if p[i + k].lemma_ != '-PRON-':
                            temp.append(p[i + k].lemma_)
                        else:
                            temp.append(p[i + k].text)

                    temp = self.to_string_from_list(temp)
                    if temp in self.uni_gram:
                        uni[self.uni_gram.index(temp)] = 1
            self.y_.append(self.df['is_sarcastic'][index])
            index += 1
            self.uni_feature.append(uni)

    def baseline2(self, n=1):
        #unigram features
        self.uni_gram = set()
        self.uni_feature = []
        self.y_ = []
        for p in self.headlines:
            self.uni_gram = self.uni_gram.union(self.n_gram_feature(p, n))

        #now find
        length = len(self.headlines)
        t1 = threading.Thread(target=self.uni_gram_features,
                              name='t1',
                              args=(0, int(length / 4), n))
        t2 = threading.Thread(target=self.uni_gram_features,
                              name='t2',
                              args=(int(length / 4), int(length / 2), n))
        t3 = threading.Thread(target=self.uni_gram_features,
                              name='t3',
                              args=(int(length / 2), int(3 * length / 4), n))
        t4 = threading.Thread(target=self.uni_gram_features,
                              name='t4',
                              args=(int(3 * length / 4), length, n))
        t1.daemon = True
        t2.daemon = True
        t3.daemon = True
        t4.daemon = True
        st = time.time()
        t1.start()
        t2.start()
        t3.start()
        t4.start()
        t1.join()
        t2.join()
        t3.join()
        t4.join()
        print(f'time taken: {time.time()-st}')
        X_train, X_test, y_train, y_test = train_test_split(self.uni_feature,
                                                            self.y_,
                                                            test_size=0.33,
                                                            random_state=42)
        return self.findLINEARSVCResult(X_train, X_test, y_train, y_test)

    def findLINEARSVCResult(self, X_train, X_test, y_train, y_test):
        '''
         Training data using LinearSVC model
        '''
        svc_model = LinearSVC()
        svc_model.fit(X_train, y_train)
        predictions = svc_model.predict(X_test)
        return (confusion_matrix(y_test, predictions),
                classification_report(y_test, predictions),
                accuracy_score(y_test, predictions))
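A hypothetical driver for the Sarcasm class above (assuming Sarcasm_Headlines_Dataset.json, the spaCy en_core_web_sm model and NLTK's vader_lexicon are available locally):

s = Sarcasm()
cm, report, acc = s.baseline1()  # sentiment-contradiction baseline
print(report)
print("accuracy:", acc)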
Example #6
import itertools
import re
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from senticnet.senticnet import SenticNet


def data_Preprocessing(data, data_test, n_of_words, polarity_threshold):
    Reviews = data["Content"]
    # Keep only ASCII characters
    # (string.printable offers another encode/decode strategy if needed)
    Reviews = Reviews.apply(lambda row: row.encode('ascii', errors='ignore').decode())

    # Set all the content to lower case
    Reviews = Reviews.apply(lambda row: row.lower())

    # Add to the following variable the characters that you want to delete
    chars_to_del = "[" + string.punctuation + string.digits + "]"
    # Delete all the chars in "chars_to_del" from each row of the dataframe
    Reviews = Reviews.apply(lambda row: re.sub(chars_to_del, '', row))
    # Tokenize every single word of the data content
    Token_Reviews = Reviews.apply(lambda row: nltk.word_tokenize(row))

    # Generate the list "stop" of elements TO BE REMOVED from the sentences (stopwords, numbers and punctuation)
    stop = stopwords.words("english")
    # Remove all the words in the variable "stop"
    Filtered_Review = Token_Reviews.apply(
        lambda row: [w for w in row if w not in stop])

    # Stem the filtered sentences; some stemmed words:
    # http://snowball.tartarus.org/algorithms/english/stemmer.html
    ps = PorterStemmer()
    for idx in range(0, len(Filtered_Review)):
        Stemmed_Review_temp = []
        for word in Filtered_Review.iloc[idx]:
            Stemmed_Review_temp.append(ps.stem(word))
        Filtered_Review.iloc[idx] = Stemmed_Review_temp

    # Terms choosing: most common words
    sn = SenticNet()

    Filtered_Review_List = list(itertools.chain.from_iterable(Filtered_Review))
    Words_Frequency = FreqDist(Filtered_Review_List)
    Most_Common_Words_Frequency = Words_Frequency.most_common(n_of_words)
    Most_Common_Words = [word for word, _ in Most_Common_Words_Frequency]

    index = 1
    words_and_polarity = pd.DataFrame(columns=["Word", "Polarity"])
    Selected_Words = []
    # Terms polarity
    for word in Most_Common_Words:
        try:
            temp = sn.polarity_intense(word)
            if abs(float(temp)) > polarity_threshold:
                words_and_polarity.loc[index] = [word, float(temp)]
                index = index + 1
                Selected_Words.append(word)
        except Exception:
            # word not in the SenticNet lexicon
            continue
    # Uncomment to recompute the selected words and their polarity
    # words_and_polarity.to_csv("Words_and_Polarity.csv", sep=",")

    return data, data_test
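A hypothetical call, assuming train/test DataFrames with a "Content" column and the NLTK 'punkt' and 'stopwords' corpora downloaded:

train_data, test_data = data_Preprocessing(train_data, test_data,
                                           n_of_words=200, polarity_threshold=0.5)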
Example #7

import pickle

from senticnet.senticnet import SenticNet

sn = SenticNet()
zeroSen = 0
tp = 0
tn = 0
fp = 0
fn = 0
actT = 0
with open("Dataset.pickle", "rb") as handle:
    pyDS = pickle.load(handle)
    for doc in pyDS.DocList:
        totSen = 0
        for w in doc.TermList:
            try:
                sen = sn.polarity_intense(w)
            except KeyError:
                sen = 0

            # if w in slangwords:
            #     sen = slangwords[w]
            # elif w in kaggeleSentiment:
            #     sen = kaggeleSentiment[w]
            # elif w in porvalis:
            #     sen = porvalis[w]

            totSen = totSen + float(sen)
        doc.Sentiment = totSen
        # true positive
        if doc.Class == 1 and totSen < 0.5:
            tp = tp + 1
Example #8
import boc
from senticnet.senticnet import SenticNet

# Each line of the corpus must correspond to one document of the corpus
# boc_model = boc.BOCModel(doc_path="input corpus path")
boc_model = boc.BOCModel('text.txt')

# boc_model.context = text

# the output can be saved with the save_path parameter
boc_matrix, word2concept_list, idx2word_converter = boc_model.fit()

# SenticNet lexicon lookup
sn = SenticNet()

text = "love"  # assumed sample concept to look up

concept_info = sn.concept(text)
polarity_value = sn.polarity_value(text)
polarity_intense = sn.polarity_intense(text)
moodtags = sn.moodtags(text)
semantics = sn.semantics(text)
sentics = sn.sentics(text)

print('==================================')
print('text: ', text)
print('concept_info: ', concept_info)
print('polarity_value: ', polarity_value)
print('polarity_intense: ', polarity_intense)
print('moodtags: ', moodtags)
print('semantics: ', semantics)
print('sentics: ', sentics)
print('==================================')
Example #9
from nltk import word_tokenize
from wordcloud import WordCloud

import matplotlib.pyplot as plt
import pandas as pd

# Assumed reconstruction of the truncated opening: build the word cloud from a
# dictionary of negative-word frequencies
wordcloud = WordCloud().generate_from_frequencies(
    negative_words
)  # wordcloud using frequencies (this needs a dictionary object)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

#Most positive and Most Negative Reviews

Review_polarity_df = pd.DataFrame()
for review in clean_reviews_final.without_stopwords:
    tokens = word_tokenize(review)
    count = 0
    pol = 0
    for word in tokens:
        if word in sn.data:
            pol = pol + float(sn.polarity_intense(word))
            count = count + 1
    if count > 0:
        polarity = pol / count
    else:
        polarity = 0
    temp_df = pd.DataFrame([review, polarity]).T
    Review_polarity_df = pd.concat([Review_polarity_df, temp_df],
                                   ignore_index=True)

Review_polarity_df.columns = ['review', 'polarity']

Reviews_Sorted = Review_polarity_df.sort_values(by=['polarity'],
                                                ascending=[False])

Most_positive_Reviews = Review_polarity_df.sort_values(
Example #10
                            datas[idx]['ContextVector'][1] += count[f]['sent']
                    if tags[tokens.index(w)][1] in nn:
                        if w in count.keys():
                            datas[idx]['ContextVector'][2] = count[w]['sent']
                    elif tags[tokens.index(w)][1] in vb:
                        if w in count.keys():
                            datas[idx]['ContextVector'][3] = count[w]['sent']
                    # print(sentiment_feature[f][w]['sent'])

    for word, tag in tags:
        if tag in adj:
            if word in count.keys():
                datas[idx]['DsVector'][0] += count[word]['sent']
                datas[idx]['PmiVector'][0] += count[word]['PMI_sent']
            if word in sn.data.keys():
                datas[idx]['SnVector'][0] += float(sn.polarity_intense(word))
            if word in bl_sent.keys():
                datas[idx]['BlVector'][0] += bl_sent[word]
        elif tag in adv:
            if word in count.keys():
                datas[idx]['DsVector'][1] += count[word]['sent']
                datas[idx]['PmiVector'][1] += count[word]['PMI_sent']
            if word in sn.data.keys():
                datas[idx]['SnVector'][1] += float(sn.polarity_intense(word))
            if word in bl_sent.keys():
                datas[idx]['BlVector'][1] += bl_sent[word]
        elif tag in nn:
            if word in count.keys():
                datas[idx]['DsVector'][2] = count[word]['sent']
                datas[idx]['PmiVector'][2] += count[word]['PMI_sent']
            if word in sn.data.keys():
Example #11
import os
import pickle
import re

from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from senticnet.senticnet import SenticNet
from sentistrength import PySentiStr
from sklearn.feature_extraction.text import TfidfVectorizer

# CODE_PATH, PICKLES_PATH and the Sequence*/Capitalized*/Exclamation* count
# helpers are defined elsewhere in the original project.


def pre_process_and_predict(sentence):
    wordnet_lemmatizer = WordNetLemmatizer()
    # # Replacing double quotes with single, within a string
    sentence = sentence.replace("\"", "\'")
    # # Removing unnecessary special characters, keeping only ,  ! ?
    sentence = re.sub(r"[^!?,a-zA-Z0-9\ ]+", '', sentence)
    # # Lemmatization on verbs
    sentence = ' '.join([
        wordnet_lemmatizer.lemmatize(word, pos='v')
        for word in word_tokenize(sentence)
    ])

    sn = SenticNet()
    senti = PySentiStr()
    senti.setSentiStrengthPath(CODE_PATH + '/sentistrength/SentiStrength.jar')
    senti.setSentiStrengthLanguageFolderPath(
        CODE_PATH + '/sentistrength/SentStrength_Data/')

    sentiment_score = []

    for sen in sent_tokenize(sentence):
        senti_pos, senti_neg = senti.getSentiment(sen, score='dual')[0]
        senti_pos -= 1
        if senti_neg == -1:
            senti_neg = 0
        sum_pos_score = 0
        sum_neg_score = 0
        for word in word_tokenize(sen):
            try:
                w_score = float(sn.polarity_intense(word)) * 5
            except KeyError:
                w_score = 0
            if w_score > 0:
                sum_pos_score = sum_pos_score + w_score
            elif w_score < 0:
                sum_neg_score = sum_neg_score + w_score
        sum_pos_score = (sum_pos_score + senti_pos) / 2
        sum_neg_score = (sum_neg_score + senti_neg) / 2
        sentiment_score.append((sum_pos_score, sum_neg_score))
    additional_features_s = []
    additional_features_ns = []

    contra = []
    pos_low = []
    pos_medium = []
    pos_high = []
    neg_low = []
    neg_medium = []
    neg_high = []

    for sum_pos_score, sum_neg_score in sentiment_score:
        contra.append(int(sum_pos_score > 0 and abs(sum_neg_score) > 0))
        pos_low.append(int(sum_pos_score < 0))
        pos_medium.append(int(sum_pos_score >= 0 and sum_pos_score <= 1))
        pos_high.append(int(sum_pos_score >= 2))
        neg_low.append(int(sum_neg_score < 0))
        neg_medium.append(int(sum_neg_score >= 0 and sum_neg_score <= 1))
        neg_high.append(int(sum_neg_score >= 2))
    additional_features_s = additional_features_s + [
        max(pos_medium),
        max(pos_high),
        max(neg_medium),
        max(neg_high)
    ]
    additional_features_ns = additional_features_ns + [
        max(pos_low), max(neg_low)
    ]

    tweet = sentence
    punctuation_count = SequencePunctuationCount(tweet)
    character_count = SequenceCharacterCount(tweet)
    capitalized_count = CapitalizedCount(tweet)
    exclamation_count = ExclamationCount(tweet)
    #     emoji_count       = EmojiCount(tweet)
    f_count = [
        punctuation_count, character_count, capitalized_count,
        exclamation_count
    ]
    for count in f_count:
        f_low = int(count == 0)
        f_medium = int(count >= 1 and count <= 3)
        f_high = int(count >= 4)
        additional_features_s = additional_features_s + [f_medium, f_high]
        additional_features_ns = additional_features_ns + [f_low]
    X = [sentence]

    in_file = open(os.path.join(PICKLES_PATH, "vocab.pickle"), "rb")
    vocab = pickle.load(in_file)
    in_file.close()

    in_file = open(os.path.join(PICKLES_PATH, "model.pickle"), "rb")
    model = pickle.load(in_file)
    in_file.close()

    vectorizer = TfidfVectorizer(vocabulary=vocab)
    X = vectorizer.fit_transform(X)
    ans = int(sum(model.predict(X)))
    print('Sentence : ', sentence)
    print('Sarcastic features : ', additional_features_s)
    print('Not Sarcastic features : ', additional_features_ns)
    print('Contradict : ', max(contra))
    print('Model Predict : ', ans)
    print(
        'My obs : ',
        int((sum(additional_features_s) >= sum(additional_features_ns))
            and max(contra) == 1))
    print('Final Prd : ', end='')

    if ans == 1 or ((sum(additional_features_s) >= sum(additional_features_ns))
                    and max(contra) == 1):
        return True
    else:
        return False
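A hypothetical call of the function above (assuming the pickled vocab/model files and the SentiStrength jar referenced in the code are in place):

if pre_process_and_predict("I just love being ignored all day"):
    print("sarcastic")
else:
    print("not sarcastic")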
Example #12
from senticnet.senticnet import SenticNet

teste = []
sn = SenticNet('pt')
concept_info = sn.concept('amor')
polarity_value = sn.polarity_value('amor')
polarity_intense = sn.polarity_intense('amor')
moodtags = sn.moodtags('amor')
semantics = sn.semantics('amor')
sentics = sn.sentics('amor')

teste.append(concept_info)

print(teste)