Example #1
from senticnet.senticnet import SenticNet


def doc_sentiment(data):
    """Classify a tokenized document as positive (1) or negative (0)
    by averaging the SenticNet polarity of its words."""
    sn = SenticNet()

    total_sentiment = 0

    # Sum the sentiment of every word in the document
    for word in data:
        # Words missing from the SenticNet vocabulary raise a KeyError;
        # treat them as having a sentiment of 0.
        # (polarity_value is assumed numeric in this SenticNet version.)
        try:
            total_sentiment += float(sn.polarity_value(word))
        except KeyError:
            pass

    try:
        # An empty document would cause a ZeroDivisionError here
        avg_sentiment = total_sentiment / len(data)
    except ZeroDivisionError:
        avg_sentiment = 0

    return 1 if avg_sentiment >= 0 else 0
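A minimal usage sketch (hypothetical input; assumes the senticnet package is installed and the document is already lowercased and tokenized):

tokens = "i love this phone but the battery is awful".split()
print(doc_sentiment(tokens))  # 1 if the average polarity is non-negative, else 0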
Example #2
import itertools

import nltk
from nltk.probability import FreqDist
from senticnet.senticnet import SenticNet


def Terms_Chooser(data, n_of_words, polarity_threshold):
    """Return the most frequent words in data["Content"] whose SenticNet
    polarity intensity exceeds polarity_threshold in absolute value."""
    sn = SenticNet()

    # Tokenize every document, then flatten into a single word list
    data["Content"] = data["Content"].apply(nltk.word_tokenize)
    lista = list(itertools.chain.from_iterable(data["Content"]))

    # Keep the n_of_words most common words
    FD = FreqDist(lista)
    common_words = [word for word, _ in FD.most_common(n_of_words)]

    # Keep only the common words whose polarity is strong enough;
    # words missing from SenticNet raise a KeyError and are skipped
    words = []
    for x in common_words:
        try:
            temp = float(sn.polarity_intense(x))
            if abs(temp) > polarity_threshold:
                words.append(x)
        except KeyError:
            continue

    return words
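A quick smoke test (hypothetical data; assumes pandas is installed and NLTK's 'punkt' tokenizer data has been downloaded):

import pandas as pd

reviews = pd.DataFrame({"Content": ["i love this phone", "the battery life is terrible"]})
print(Terms_Chooser(reviews, n_of_words=10, polarity_threshold=0.1))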
Example #3
from senticnet.senticnet import SenticNet


class Dictionary:
    def __init__(self):
        self.sn = SenticNet()

    def get_word_polarity(self, word, log=True):
        """
        Input : String
        Output : "positive" or "negative" ("empty" if the word is unknown)
        """
        value = "empty"
        try:
            value = self.sn.polarity_value(word.lower())
        except KeyError:
            if log:
                print('An error occurred. Word: ' + word + ' is not known.')
        return value

    def get_word_polarity_numerical_value(self, word, log=True):
        """
        Input : String
        Output : polarity intensity in [-1, 1] ("empty" if the word is unknown)
        """
        value = "empty"
        try:
            value = self.sn.polarity_intense(word.lower())
        except KeyError:
            if log:
                print('An error occurred. Word: ' + word + ' is not known.')
        return value
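A short usage sketch (exact outputs depend on the installed SenticNet version):

d = Dictionary()
print(d.get_word_polarity("love"))                  # e.g. "positive"
print(d.get_word_polarity_numerical_value("love"))  # e.g. an intensity such as '0.85'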
Example #4
from senticnet.senticnet import SenticNet


def senticnet(text):
    """
    Returns a list obtained from SenticNet with the following four features averaged:
    [pleasantness_value, attention_value, sensitivity_value, aptitude_value]

    :param text: input text pre-processed by spaCy
    :return: a list with the SenticNet features averaged over all the words in text
    """
    list_features = [0] * 4
    sn = SenticNet()
    count_words = 0

    for token in text:
        try:
            concept_info = sn.concept(str(token))  # str() lets spaCy tokens work here too
            list_features[0] += float(concept_info['sentics']['pleasantness'])
            list_features[1] += float(concept_info['sentics']['attention'])
            list_features[2] += float(concept_info['sentics']['sensitivity'])
            list_features[3] += float(concept_info['sentics']['aptitude'])
            count_words += 1
        except KeyError:
            pass

    if count_words != 0:
        list_features = [feature / count_words for feature in list_features]

    return list_features
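A call sketch (hypothetical input; a plain list of token strings works in place of a spaCy Doc here):

print(senticnet("a lovely little phone".split()))  # four averaged sentic values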
Example #5
from senticnet.senticnet import SenticNet


class SenticValuer:  # class name inferred from the testing snippet below
    def getSentics(self, word):
        """Return [pleasantness, attention, sensitivity, aptitude, polarity_intensity]
        for word, or all zeros if it is not in SenticNet."""
        senticsAndIntensity = []
        sn = SenticNet('en')
        try:
            sentics = sn.sentics(word)
            polarity_intensity = sn.polarity_intense(word)

            senticsAndIntensity.append(float(sentics['pleasantness']))
            senticsAndIntensity.append(float(sentics['attention']))
            senticsAndIntensity.append(float(sentics['sensitivity']))
            senticsAndIntensity.append(float(sentics['aptitude']))
            senticsAndIntensity.append(float(polarity_intensity))

            return senticsAndIntensity

        except KeyError:
            return [0.0, 0.0, 0.0, 0.0, 0.0]


# ##TESTING AREA
# yas = SenticValuer()
# print(yas.getSentics("awkward"))
Example #6
from senticnet.senticnet import SenticNet


def sem(d):
    """Return True if d has semantic links in SenticNet, else False."""
    try:
        sn = SenticNet()
        sn.semantics(d)
        return True
    except KeyError:
        return False
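Usage sketch:

print(sem("love"))    # True: "love" has semantic links in SenticNet
print(sem("qwerty"))  # presumably False: not a SenticNet concept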
Example #7
def fun1(d):
    """Return True if d has semantic links in SenticNet, else False."""
    try:
        from senticnet.senticnet import SenticNet
        sn = SenticNet()
        sn.semantics(d)
        return True
    except KeyError:
        return False
Example #8
def get_emotions(tokens):
    """Map each token to its SenticNet moodtags; tokens outside SenticNet are skipped."""
    from senticnet.senticnet import SenticNet
    result = {}
    sn = SenticNet()
    for token in tokens:
        if token in sn.data:
            moodtags = sn.moodtags(token)
            print(token, moodtags)
            result[token] = moodtags
    return result
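Usage sketch (hypothetical tokens; words outside SenticNet are silently skipped):

print(get_emotions(["love", "hate", "qwerty"]))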
Example #9
    def __init__(self):
        print("Start SenticNet - Sentiment Analysis")
        self.sp = Support()
        self.sn = SenticNet()
        self.corpus = self.sp.import_corpus_bank()
        self.terminology = self.sp.import_bank_terminology(filename='bank_terminology')
        self.data, self.label = self.sp.process_data(filename='bank_message',
                                                     size_msg=3,
                                                     clean=True,
                                                     replace_text=True,
                                                     stemmed=None,
                                                     lemmatize=None,
                                                     spelling=None)
Example #10
import nltk
from nltk.stem import WordNetLemmatizer
from senticnet.senticnet import SenticNet


def getMaxSum_senti(text):
    """Return (average, maximum) of the per-sentence sentiment sums of text.
    Relies on an external getSentenceSentiSum(pos_tags, wnl, sn) helper."""
    wnl = WordNetLemmatizer()
    sn = SenticNet()
    sentences = nltk.sent_tokenize(text)

    text_sentiAvg = 0
    sentence_maxSenti = 0

    for index in range(len(sentences)):
        sentence = sentences[index].strip()
        sentence = sentence[0:-1]  # drop the trailing punctuation mark

        assert '.' not in sentence
        words = nltk.word_tokenize(sentence.lower())
        pos_tags = nltk.pos_tag(words)
        sentence_sentiSum = getSentenceSentiSum(pos_tags, wnl, sn)

        if sentence_sentiSum > sentence_maxSenti:
            sentence_maxSenti = sentence_sentiSum

        text_sentiAvg += sentence_sentiSum

    text_sentiAvg = round(text_sentiAvg / len(sentences), 6)
    sentence_maxSenti = round(sentence_maxSenti, 6)

    return text_sentiAvg, sentence_maxSenti
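A call sketch (only runs once a getSentenceSentiSum(pos_tags, wnl, sn) helper is defined elsewhere, e.g. one summing polarity_intense over the tagged words):

avg, max_senti = getMaxSum_senti("I love this phone. The battery is awful.")
print(avg, max_senti)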
Example #11
from scipy.sparse import csr_matrix, lil_matrix, vstack
from sklearn.base import BaseEstimator, TransformerMixin
from senticnet.senticnet import SenticNet


class SenticNets(BaseEstimator, TransformerMixin):
    """Map each document to a sparse vector of SenticNet polarity values,
    indexed by a word -> column vocabulary dict."""

    def __init__(self, vocab):
        self.vocab = vocab
        self.X_width = len(vocab)
        self.sn = SenticNet()

    def fit(self, X, y=None):
        return self

    def vector(self, X):
        # lil_matrix supports cheap item assignment; convert to csr at the end
        row = lil_matrix((1, self.X_width))
        for word in X.split(' '):
            if word not in self.vocab:
                continue
            try:
                # polarity_value is assumed numeric in this SenticNet version
                score = self.sn.polarity_value(word)
            except KeyError:
                continue
            row[0, self.vocab[word]] = score
        return csr_matrix(row)

    def transform(self, X):
        self.X_length = X.shape[0]
        self.zeros = vstack([self.vector(X[i]) for i in range(self.X_length)])
        return self.zeros

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)
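A usage sketch (hypothetical vocab; assumes a SenticNet version whose polarity_value is numeric):

import numpy as np

vocab = {"love": 0, "hate": 1, "phone": 2}
docs = np.array(["love this phone", "hate the phone"])
print(SenticNets(vocab).fit_transform(docs).shape)  # (2, 3)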
Example #12
    def __init__(self, *args, **kwargs):
        # loading necessaries
        self.nlp = spacy.load("en_core_web_sm")
        self.senti = SenticNet()
        self.sid = SentimentIntensityAnalyzer()
        # loading dataset
        self.df = pd.read_json("./Sarcasm_Headlines_Dataset.json", lines=True)
        self.df = self.df[:15000]
        self.df.drop(columns="article_link",
                     inplace=True)  # dropping unnecessary attribute
        # storing nlp data in the headlines variable
        self.headlines = []
        self.uni_gram = set()
        self.uni_feature = []
        self.y_ = []
        for i in self.df['headline']:
            self.headlines.append(self.nlp(i))
Example #13
    def sentiment_avg(self, text):
        """Return True if the average SenticNet ('pt') polarity of the text
        falls outside the neutral band [-0.003, 0.003]."""
        sn = SenticNet('pt')
        list_polarity = []
        words = text.split()
        qtd_words = len(words)  # count words, not characters
        for word in words:
            try:
                # polarity_value is assumed numeric in this SenticNet version
                polarity_value = sn.polarity_value(
                    self.treatment_string(word))
                list_polarity.append(polarity_value)
            except KeyError:
                # word not in SenticNet: exclude it from the average
                qtd_words -= 1

        avg_n = self.avg(list_polarity, qtd_words)
        return avg_n > 0.003 or avg_n < -0.003
Example #14
class SenticNetSA:

    def __init__(self):
        print("Start SenticNet - Sentiment Analysis")
        self.sp = Support()
        self.sn = SenticNet()
        self.corpus = self.sp.import_corpus_bank()
        self.terminology = self.sp.import_bank_terminology(filename='bank_terminology')
        self.data, self.label = self.sp.process_data(filename='bank_message',
                                                size_msg=3,
                                                clean=True,
                                                replace_text=True,
                                                stemmed=None,
                                                lemmatize=None,
                                                spelling=None)

    def baseline(self):
        TP = 0
        FP = 0
        FN = 0
        x_train, x_test, y_train, y_test = train_test_split(self.data, self.label, test_size=0.20, random_state=1000)
        for i in range(len(x_train)):
            msg = str(x_train[i])
            value = float(y_train[i])
            result = self.sn.message_concept(msg)
            polarity_value = float(result['polarity_value'])
            # Discretize: near-zero polarity counts as neutral, otherwise +/-1
            if -0.1 < polarity_value < 0.1:
                polarity_value = 0.0
            else:
                polarity_value = 1.0 if polarity_value > 0 else -1.0
            if value == polarity_value:
                TP += 1
            else:
                FP += 1
                if value == 1.0 and polarity_value in (0.0, -1.0):
                    FN += 1
                elif value == 0.0 and polarity_value in (1.0, -1.0):
                    FN += 1
                elif value == -1.0 and polarity_value in (0.0, 1.0):
                    FN += 1

        precision = TP/(TP + FP)
        recall = TP / (TP + FN)
        f1 = 2*((precision*recall) / (precision + recall))
        print("f1-score : {}%".format(round(f1 * 100, 2)))
Example #15

    # Find how often each word occurs (using the full list, not the unique-word list)
    from nltk.probability import FreqDist
    fdist = FreqDist(ALL_filtered_sentence)
    print("fdist : {}".format(fdist))
    print(fdist.most_common(200))

    # ------------------------------------------------------------SENTICNET
    from MdCek.DBRepository.WordList_sentic_Repository import WordList_sentic_Repository
    from MdCek.Model.WordList_sentic import WordList_sentic
    import pprint
    from senticnet.senticnet import SenticNet

    wordSentic = WordList_sentic_Repository()
    sn = SenticNet()
    print("----------------------------------------- reading the existing depression wordlist from NLTK and textblob")
    # EXISTING NEGATIVE WORDLIST

    existWordlistDepression = wordList_depRepo.read()

    lemmaOfExistWordlist = []
    conceptExist = []
    conceptExistNegative = []
    objsConceptExistNegative = []

    conceptExistPositive = []
    objsConceptExistPositive = []
    sisa = 0

    # print("Number of existing wordlist entries: {}".format(len(existWordlistDepression)))
Example #16
document_term_matrix_idf.head(10)

# word cloud
# Word cloud using the TF-IDF weights of words: some words repeat across all the
# docs, and IDF's log weighting normalizes them down.
words = dict(document_term_matrix_idf.apply(
    sum, axis=0))  # fit_words() needs a dictionary object
wordcloud = WordCloud(
    max_font_size=40, max_words=50, background_color="white").fit_words(
        words)  # fit_words() plots a word cloud from a dictionary
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

##### Positive and negative words from the SenticNet lexicon, for plotting their word clouds.
sn = SenticNet()
positive_words = []
negative_words = []
for word in vectorizer.get_feature_names():
    if word in sn.data:
        if sn.polarity_value(word) == 'positive':
            positive_words.append(word)
        if sn.polarity_value(word) == 'negative':
            negative_words.append(word)

len(positive_words)
len(negative_words)

positive_words = dict(document_term_matrix[positive_words].apply(sum, axis=0))
negative_words = dict(document_term_matrix[negative_words].apply(sum, axis=0))
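The same fit_words() call can then render the two lexicon-filtered clouds (a sketch reusing the names defined above):

for name, freqs in (("positive", positive_words), ("negative", negative_words)):
    wc = WordCloud(max_font_size=40, max_words=50,
                   background_color="white").fit_words(freqs)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.title(name)
    plt.show()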
Example #17
            for _, records in groupby(sorted(lst, key=keyprop), keyprop)
        ]


    a = [{'time': '25 APR', 'total': 10, 'high': 10},
         {'time': '26 APR', 'total': 5, 'high': 5}]

    b = [{'time': '24 APR', 'total': 10, 'high': 10},
         {'time': '26 APR', 'total': 15, 'high': 5}]
    merger = merge_list_of_records_by('time', add)
    hasil_merge = merger(a+b)
    print(hasil_merge)

    print("sinonim with thesaurus==================================================================")
    # from PyDictionary import PyDictionary
    #
    # dictionary = PyDictionary()
    # print(dictionary.synonym("good"))

    from thesaurus import Word

    w = Word('suicidal')
    syn = w.synonyms()
    print(syn)

    sn = SenticNet()
    try:
        concept_info_sinonim = sn.concept("suicidal")
        print(concept_info_sinonim)
    except Exception as e:
        print(e)
Example #18
def get_clues(text):
    print("*--------(%s)-------------*" % (text))
    print(type(text))
    nlp = StanfordCoreNLP('http://localhost:9001')
    stop_words = set(stopwords.words('english'))
    '''
		Method to remove numbers appended at last
	'''
    dep_parse = nlp.annotate(text,
                             properties={
                                 'annotators': 'depparse',
                                 'outputFormat': 'json',
                                 'timeout': 10000,
                             })

    pos = nlp.annotate(text,
                       properties={
                           'annotators': 'lemma',
                           'outputFormat': 'json',
                           'timeout': 10000,
                       })

    sn = SenticNet()
    word_to_dep = [{} for i in range(len(dep_parse['sentences']))]
    word_to_par = [{} for i in range(len(dep_parse['sentences']))]
    word_to_pos = [{} for i in range(len(dep_parse['sentences']))]
    word_to_lemma = [{} for i in range(len(dep_parse['sentences']))]
    word_to_child = [{} for i in range(len(dep_parse['sentences']))]
    sents = [[] for i in range(len(dep_parse['sentences']))]
    index_to_word = {}
    '''
		Constructing dicts for maintaining the dependencies among words. 
	'''
    '''
		Appending each word by occurence number to maintain distinct word count
	'''
    #print(dep_parse['sentences'])
    print("********")
    for i, sent in enumerate(dep_parse['sentences']):
        for dep in sent['basicDependencies']:
            word_to_dep[i][dep['dependentGloss'] +
                           str(dep['dependent'])] = dep['dep']
            word_to_par[i][dep['dependentGloss'] +
                           str(dep['dependent'])] = dep['governorGloss'] + str(
                               dep['governor'])
            index_to_word[dep['dependentGloss'] +
                          str(dep['dependent'])] = dep['dependentGloss']

            if (dep['governorGloss'] + str(dep['governor'])
                    not in word_to_child[i]):
                word_to_child[i][dep['governorGloss'] +
                                 str(dep['governor'])] = []
            if (dep['dependentGloss'] + str(dep['dependent'])
                    not in word_to_child[i]):
                word_to_child[i][dep['dependentGloss'] +
                                 str(dep['dependent'])] = []
            word_to_child[i][dep['governorGloss'] +
                             str(dep['governor'])].append(
                                 dep['dependentGloss'] + str(dep['dependent']))
            sents[i].append(dep['dependentGloss'] + str(dep['dependent']))
        word_to_dep[i]['ROOT0'] = 'root'
        word_to_par[i]['ROOT0'] = 'root'

    for i, sent in enumerate(pos['sentences']):
        for pos_tagger in sent['tokens']:
            word_to_pos[i][pos_tagger['word']] = pos_tagger['pos']
            word_to_lemma[i][pos_tagger['word']] = pos_tagger['lemma']
        word_to_pos[i]['ROOT'] = 'root'
        word_to_lemma[i]['ROOT'] = 'root'
    '''
		Displaying the deps
	'''

    ## Implementing rules to extract aspects
    for i, sent in enumerate(sents):
        if (__name__ == '__main__'):
            print(word_to_dep[i], word_to_par[i], word_to_pos[i])
            print("Children==>")
            print(word_to_child[i])

    aspects = []
    for i, sent in enumerate(sents):
        for word in sent:
            '''
				Rule 0
			'''
            if ('subj' in word_to_dep[i][word]):
                for child in word_to_child[i][word_to_par[i][word]]:
                    if ('amod' in word_to_dep[i][child]
                            or 'advmod' in word_to_dep[i][child]):
                        aspects.append(word_to_par[i][word])
                        if (__name__ == '__main__'):
                            print("Rule 0 triggered.")
            '''
				Rule 1 (without sub): Very big to hold.
			'''
            if (word_to_dep[i][word] == 'xcomp' and
                ('JJ' in word_to_pos[i][index_to_word[word_to_par[i][word]]] or
                 'RB' in word_to_pos[i][index_to_word[word_to_par[i][word]]])):
                if (__name__ == '__main__'):
                    print("Rule 1 triggered")
                aspects.append(word_to_par[i][word])
            '''
				Rule 2 (without subj): Not to mention the price of the phone
			'''
            if (word_to_dep[i][word] == 'dobj' and 'VB'
                    in word_to_pos[i][index_to_word[(word_to_par[i][word])]]
                    and ('NN' in word_to_pos[i][index_to_word[(word)]]
                         or 'JJ' in word_to_pos[i][index_to_word[(word)]])):
                aspects.append(word)
                if (__name__ == '__main__'):
                    print("Rule 2 triggered")
                    print(word)
            '''
				Rule 3 (without subj): Love the sleekness of the player
			'''

            if ('NN' in word_to_pos[i][index_to_word[(word)]]
                    and word_to_dep[i][word] == 'nmod'):
                aspects.append(word_to_par[i][word])
                if (__name__ == '__main__'):
                    print("Rule 3 triggered")
                    print(word_to_par[i][word])
                '''
				Rule 4 (with sub): The battery lasts little 
				two aspects 
			'''
            if (word_to_dep[i][word] == 'advmod'
                    or word_to_dep[i][word] == 'amod' or word_to_dep[i][word]
                    == 'advcl') and ('VB' in word_to_pos[i][index_to_word[(
                        word_to_par[i][word])]]):
                aspects.append(word_to_par[i][word])
                for word2 in sent:
                    if (word2 != word and word_to_dep[i][word2] == 'nsubj'
                            and word_to_par[i][word2] == word_to_par[i][word]
                            and
                        ('NN' in word_to_pos[i][index_to_word[word2]]
                         or 'JJ' in word_to_pos[i][index_to_word[word2]])):
                        aspects.append(word2)
                        if (__name__ == '__main__'):
                            print("Rule 4 triggered")
                            print(word2)
                '''
				Rule 5 (with sub): I like the lens of this camera
			'''
            if ('NN' in word_to_pos[i][index_to_word[(word)]]
                    and word_to_dep[i][word] == 'dobj'):
                if (__name__ == '__main__'):
                    print("Rule 5 triggered")
                    print(word)
                try:
                    concept_info = sn.concept((word))
                    print("present in senticnet")
                except KeyError:
                    print("Yay")
                    aspects.append(word)
            '''
				Rule 6 : I like the beauty of the screen.
				Check if senticnet condition should be added
			'''
            if ('NN' in word_to_pos[i][index_to_word[(word)]]
                    and word_to_dep[i][word] == 'dobj'):
                try:
                    concept_info = sn.concept((word))
                    aspects.append(word)
                    print("yay!")
                except KeyError:
                    print("oops, not there in SenticNet")
                for word2 in sent:
                    if (word2 != word and word_to_par[i][word2] == word and
                            'NN' in word_to_pos[i][index_to_word[(word2)]]):
                        aspects.append(word2)
                        if (__name__ == '__main__'):
                            print("Rule 6 triggered.")
                            print(word2)
            '''
				Rule 7 : I would like to comment on the camera of this phone. 
			
			'''
            if (word_to_dep[i][word] == 'xcomp'):
                try:
                    concept_info = sn.concept((word))
                    aspects.append(word)
                    print("yay!")
                except KeyError:
                    print("oops, not there in SenticNet")
                for child in word_to_child[i][word]:
                    if ('NN' in word_to_pos[i][index_to_word[child]]):
                        aspects.append(child)
                        if (__name__ == '__main__'):
                            print("Rule 7 triggered.")
                            print(word)
                            print(child)
            '''
				Rule 8 : The car is expensive.
			'''
            if (word_to_dep[i][word] == 'nsubj'):
                for word2 in sent:
                    if (word2 != word and word_to_dep[i][word2] == 'cop'
                            and word_to_par[i][word2] == word_to_par[i][word]):
                        aspects.append(word_to_par[i][word])
                        if (__name__ == '__main__'):
                            print("Rule 8 triggered")
                            print(word_to_par[i][word])
            '''			
				Rule 9 : The camera is nice.
			'''
            if (word_to_dep[i][word] == 'nsubj'
                    and 'NN' in word_to_pos[i][index_to_word[(word)]]):
                for word2 in sent:
                    if (word2 != word and word_to_dep[i][word2] == 'cop'
                            and word_to_par[i][word2] == word_to_par[i][word]):
                        aspects.append(word)
                        if (__name__ == '__main__'):
                            print("Rule 9 triggered")
                            print(word)
            '''
				Rule 10 : The phone is very lightweight to carry.
			'''
            if (word_to_dep[i][word] == 'cop'):
                for word2 in sent:
                    if (word2 != word
                            and 'VB' in word_to_pos[i][index_to_word[(word2)]]
                            and word_to_par[i][word] == word_to_par[i][word2]):
                        aspects.append(word2)
                        if (__name__ == '__main__'):
                            print("Rule 10 triggered.")
                            print(word2)
            '''
				Extracting mods of dobjs

			'''
            if (word_to_dep[i][word] == 'dobj'):
                for child in word_to_child[i][word]:
                    if ('mod' in word_to_dep[i][child] and 'JJ'
                            in word_to_pos[i][index_to_word[(child)]]):
                        aspects.append(child)
            '''
				Rule 11 : Checking for conjunctions
			'''
        for asp in aspects:
            for word in sent:
                if (word_to_dep[i][word] == 'conj'
                        and word_to_par[i][word] == asp):
                    aspects.append(word)
                    if (__name__ == '__main__'):
                        print("Rule conj triggered.")
                        print(word)

    finalIAC = set(aspects)
    finalIAC = [index_to_word[f] for f in finalIAC]
    finalIAC = [w for w in finalIAC if not w in stop_words]

    finalSenti = []
    for iac in finalIAC:
        try:
            concept_info = sn.concept((iac))
            finalSenti.append(iac)
        except KeyError:
            print("No word available for " + iac)

    return finalIAC, finalSenti
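A call sketch (assumes a Stanford CoreNLP server is listening on http://localhost:9001):

iacs, senti_iacs = get_clues("The camera is nice but the battery lasts little.")
print(iacs)        # candidate aspect words
print(senti_iacs)  # the subset that also exists in SenticNet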
Example #19
class Get_IAC():
    def __init__(self):
        self.col = ['Name', 'Brand', 'Price', 'Title', 'Score', 'Time', 'Text']
        self.sn = SenticNet('en')
        self.wordnet_lemmatizer = WordNetLemmatizer()

    def review_to_sentences(self, review):
        #     review = review.replace(',','.')
        review = review.replace('.', '. ')
        raw_sentences = sent_tokenize(review)
        return raw_sentences

    def InputData(self, input_path):
        self.dict_list = []
        if '.csv' in input_path:
            with open(input_path, 'r', encoding='utf8') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    d = {i: row[i] for i in self.col}
                    self.dict_list.append(d)
        elif '.xlsx' in input_path:
            wb = load_workbook(input_path)
            sheet = wb.active
            count = 0
            for row in sheet.rows:
                if count == 0:
                    count += 1
                    continue
                d = {}
                name = 0
                for cell in row:
                    d[self.col[name]] = cell.value
                    name += 1
                self.dict_list.append(d)

        self.dict_list = [
            x for x in self.dict_list if x['Text'] != '' and x['Text'] is not None
        ]
        self.sentences = []
        for i in range(len(self.dict_list)):
            for j in self.review_to_sentences(self.dict_list[i]['Text']):
                self.sentences.append(j)
        self.sentences = [x for x in self.sentences if len(x) >= 5]

    def GetIAC(self):
        self.nlp = StanfordCoreNLP(r'stanford-corenlp-full-2018-10-05')
        self.IAC = []
        for i in tqdm(self.sentences):
            dependency = self.nlp.dependency_parse(i)
            token = self.nlp.word_tokenize(i)
            if [x for x in dependency if 'compound' in x] != []:
                for j in [x for x in dependency if 'compound' in x]:
                    token[j[2] - 1] = token[j[2] - 1] + '-' + token[j[1] - 1]
                    token[j[1] - 1] = ''
                i = ' '.join(token)

            parse = self.nlp.parse(i)
            dependency = self.nlp.dependency_parse(i)
            pos = self.nlp.pos_tag(i)
            token = []
            for j in pos:
                wordnet_pos = self.get_wordnet_pos(j[1])
                token.append(
                    self.wordnet_lemmatizer.lemmatize(j[0].lower(),
                                                      pos=wordnet_pos))

            # subject noun relation
            if [x for x in dependency if 'nsubj' in x] != []:
                for j in self.Subject_Noun_Rule(parse, dependency, token, pos):
                    self.IAC.append(j)
            else:  # Non subject noun relation
                for j in self.Non_Subject_Noun_Rule(parse, dependency, token,
                                                    pos):
                    self.IAC.append(j)
        self.nlp.close()
        self.IAC = list(set(self.IAC))

    def get_wordnet_pos(self, treebank_tag):
        if treebank_tag.startswith('J'):
            return wn.ADJ
        elif treebank_tag.startswith('V'):
            return wn.VERB
        elif treebank_tag.startswith('N'):
            return wn.NOUN
        elif treebank_tag.startswith('R'):
            return wn.ADV
        else:
            return wn.NOUN

    # Additional Rule: coordinating conjunctions
    def Conj(self, index, dependency, token):
        IAC = []
        index = list(set(index))
        if [x for x in dependency if 'conj' in x] != []:
            conj = [x for x in dependency if 'conj' in x]
            for j in conj:
                if j[1] in index or j[2] in index:
                    if j[1] not in index:
                        IAC.append(token[j[1] - 1])
                        index.append(j[1])
                    if j[2] not in index:
                        IAC.append(token[j[2] - 1])
                        index.append(j[2])
        return IAC

    def Subject_Noun_Rule(self, parse, dependency, token, pos):
        be = ['is', 'was', 'am', 'are', 'were']
        adv_mod = [x for x in dependency if 'advmod' in x]
        adj_mod = [x for x in dependency if 'amod' in x]
        active_token = token[[x for x in dependency if 'nsubj' in x][0][2] -
                             1]  # the subject token

        result = []
        index = []
        if adv_mod != [] or adj_mod != []:
            A, B = self.Rule1(adv_mod, adj_mod, active_token, token)
            result += A
            index += B

        #  does not have auxiliary verb
        if any(k in token
               for k in be) == False and [x for x in pos if 'MD' in x] == []:
            A, B = self.Rule2(token, pos, dependency, active_token, adv_mod,
                              adj_mod)
            result += A
            index += B

            if [x for x in dependency if 'dobj' in x] != []:
                A, B = self.Rule3(dependency, token, pos)
                result += A
                index += B

            if [x for x in dependency if 'xcomp' in x] != []:
                A, B = self.Rule4(dependency, token, pos)
                result += A
                index += B

        if [x for x in dependency if 'cop' in x] != []:
            A, B = self.Rule5(dependency, pos, active_token, token)
            result += A
            index += B

        result += self.Conj(index, dependency, token)
        return list(set(result))

    # 3.3.3 Rule 1
    def Rule1(self, adv_mod, adj_mod, active_token, token):
        IAC = []
        index = []
        if adv_mod != []:
            for j in adv_mod:
                try:
                    concept = self.sn.concept(token[j[2] - 1])
                    IAC.append(token[j[2] - 1])
                    index.append(j[2])
                except KeyError:
                    pass  # token not in SenticNet
        if adj_mod != []:
            for j in adj_mod:
                try:
                    concept = self.sn.concept(token[j[2] - 1])
                    IAC.append(token[j[2] - 1])
                    index.append(j[2])
                except KeyError:
                    pass  # token not in SenticNet
        return IAC, index

    # 3.3.3 Rule 2-1

    def Rule2(self, token, pos, dependency, active_token, adv_mod, adj_mod):
        IAC = []
        index = []
        advcl = [x for x in dependency
                 if 'advcl' in x]  # adverbial clause modifier
        if advcl != []:
            for j in advcl:
                IAC.append(token[j[1] - 1])
                index.append(j[1])
                IAC.append(active_token)
                index.append([x for x in dependency if 'nsubj' in x][0][2])

        if adv_mod != []:
            for j in adv_mod:
                IAC.append(token[j[1] - 1])
                index.append(j[1])
                IAC.append(active_token)
                index.append([x for x in dependency if 'nsubj' in x][0][2])

        if adj_mod != []:
            for j in adj_mod:
                IAC.append(token[j[1] - 1])
                index.append(j[1])
                IAC.append(active_token)
                index.append([x for x in dependency if 'nsubj' in x][0][2])

        return IAC, index

    # 3.3.3 Rule 2-2 & 2-3
    def Rule3(self, dependency, token, pos):
        IAC = []
        index = []
        dobj = [x for x in dependency
                if 'dobj' in x]  #  direct object relations
        for j in dobj:
            if pos[j[2] - 1][1] == 'NN':
                try:
                    # Rule 2-3
                    concept = self.sn.concept(token[j[2] - 1])
                    IAC.append(token[j[2] - 1])
                    index.append(j[2])
                    conj = []
                    conj.append(j[2])
                    if [x for x in dependency if 'conj' in x and j[2] in x
                        ] != []:
                        for i in [
                                x for x in dependency
                                if 'conj' in x and j[2] in x
                        ]:
                            conj.append(i[1])
                            conj.append(i[2])
                    conj = list(set(conj))
                    for i in conj:
                        t1 = i
                        connect = [x for x in dependency if t1 in x]
                        for k in connect:
                            if k[1] != t1:
                                if pos[k[1] - 1][1] == 'NN':
                                    IAC.append(token[k[1] - 1])
                                    index.append(k[1])
                            if k[2] != t1:
                                if pos[k[2] - 1][1] == 'NN':
                                    IAC.append(token[k[2] - 1])
                                    index.append(k[2])
                except KeyError:
                    # Rule 2-2: noun not in SenticNet, keep it directly
                    IAC.append(token[j[2] - 1])
                    index.append(j[2])
        return IAC, index

    # 3.3.3 Rule 2-4

    def Rule4(self, dependency, token, pos):
        IAC = []
        index = []
        xcomp = [x for x in dependency
                 if 'xcomp' in x]  #  open clausal complement
        for j in xcomp:
            try:
                concept = self.sn.concept(token[j[1] - 1] + '-' +
                                          token[j[2] - 1])
                IAC.append(token[j[1] - 1] + '-' + token[j[2] - 1])
            except KeyError:
                pass  # compound token not in SenticNet
            t1 = j[2]
            connect = [x for x in dependency if t1 in x]
            for k in connect:
                if pos[k[2] - 1][1] == 'NN':
                    IAC.append(token[k[2] - 1])
                    index.append(k[2])
        return IAC, index

    # 3.3.3 Rule 3 & 3.3.3 Rule 4 & 3.3.3 Rule 5

    def Rule5(self, dependency, pos, active_token, token):
        IAC = []
        index = []
        cop = [x for x in dependency if 'cop' in x]  # copula
        # Rule 4
        if pos[[x for x in dependency if 'nsubj' in x][0][2] - 1][1] == 'NN':
            IAC.append(active_token)
            index.append([x for x in dependency if 'nsubj' in x][0][2])

        # Rule 3 & Rule 5
        for j in cop:
            # Rule 3
            conj = []
            #         if token[j[1]-1] in all_term:
            IAC.append(token[j[1] - 1])
            index.append(j[1])
            conj.append(j[1])
            if [x for x in dependency if 'conj' in x and j[1] in x] != []:
                for i in [x for x in dependency if 'conj' in x and j[1] in x]:
                    conj.append(i[1])
                    conj.append(i[2])

            # Rule 5
            conj = list(set(conj))
            for i in conj:
                t1 = i
                connect = [x for x in dependency if t1 in x]
                for k in connect:
                    if k[1] != t1:
                        if pos[k[1] - 1][1] == 'VB' or pos[k[1] -
                                                           1][1] == 'VV':
                            IAC.append(token[k[1] - 1])
                            index.append(k[1])
                            if token[t1 - 1] not in IAC:
                                IAC.append(token[t1 - 1])
                                index.append(t1)
                    if k[2] != t1:
                        if pos[k[2] - 1][1] == 'VB' or pos[k[2] -
                                                           1][1] == 'VV':
                            IAC.append(token[k[2] - 1])
                            index.append(k[2])
                            if token[t1 - 1] not in IAC:
                                IAC.append(token[t1 - 1])
                                index.append(t1)
        return IAC, index

    def Non_Subject_Noun_Rule(self, parse, dependency, token, pos):
        result = []
        index = []
        if [x for x in dependency if 'xcomp' in x] != []:
            A, B = self.Rule6(dependency, token)
            result += A
            index += B

        if [x for x in dependency if 'case' in x] != []:
            A, B = self.Rule7(dependency, pos, token)
            result += A
            index += B

        if [x for x in dependency if 'dobj' in x] != []:
            A, B = self.Rule8(dependency, token)
            result += A
            index += B

        result += self.Conj(index, dependency, token)
        return list(set(result))

    # 3.3.4 Rule 1
    def Rule6(self, dependency, token):
        IAC = []
        index = []
        xcomp = [x for x in dependency
                 if 'xcomp' in x]  #  open clausal complement
        for j in xcomp:
            #         if token[j[1]-1] in all_term:
            IAC.append(token[j[1] - 1])
            index.append(j[1])
        return IAC, index

    # 3.3.4 Rule 2
    def Rule7(self, dependency, pos, token):
        IAC = []
        index = []
        case = [x for x in dependency
                if 'case' in x]  #  a prepositional relation
        for j in case:
            if pos[j[1] - 1][1] == 'NN':
                connect = [
                    x for x in dependency if j[1] in x and 'mod' in x[0]
                ]
                for i in connect:
                    IAC.append(token[i[1] - 1])
                    IAC.append(token[i[2] - 1])
                    index.append(i[1])
                    index.append(i[2])
        return list(set(IAC)), list(set(index))

    # 3.3.4 Rule 3
    def Rule8(self, dependency, token):
        IAC = []
        index = []
        dobj = [x for x in dependency
                if 'dobj' in x]  #  a direct object relation
        for j in dobj:
            IAC.append(token[j[2] - 1])
            index.append(j[2])
        return IAC, index

    def Save(self, output_path):
        with open(output_path, 'wb') as f:
            pickle.dump(self.IAC, f)
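A pipeline sketch (hypothetical input path; assumes the Stanford CoreNLP distribution is unpacked at stanford-corenlp-full-2018-10-05):

extractor = Get_IAC()
extractor.InputData('reviews.xlsx')  # hypothetical review file with a 'Text' column
extractor.GetIAC()
extractor.Save('iac.pickle')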
Example #20
def get_clues(text):
    print("*--------(%s)-------------*" % (text))
    print(type(text))
    nlp = StanfordCoreNLP('http://localhost:9001')
    stop_words = set(stopwords.words('english'))
    '''
		Method to remove numbers appended at last
	'''
    dep_parse = nlp.annotate(text,
                             properties={
                                 'annotators': 'depparse',
                                 'outputFormat': 'json',
                                 'timeout': 10000,
                             })

    pos = nlp.annotate(text,
                       properties={
                           'annotators': 'lemma',
                           'outputFormat': 'json',
                           'timeout': 10000,
                       })

    sn = SenticNet()
    word_to_dep = [{} for i in range(len(dep_parse['sentences']))]
    word_to_par = [{} for i in range(len(dep_parse['sentences']))]
    word_to_pos = [{} for i in range(len(dep_parse['sentences']))]
    word_to_lemma = [{} for i in range(len(dep_parse['sentences']))]
    word_to_child = [{} for i in range(len(dep_parse['sentences']))]
    sents = [[] for i in range(len(dep_parse['sentences']))]
    index_to_word = {}
    aspect_result = [[] for i in range(len(dep_parse['sentences']))]
    '''
		Constructing dicts for maintaining the dependencies among words. 
	'''
    '''
		Appending each word by occurence number to maintain distinct word count
	'''
    print("********")
    for i, sent in enumerate(dep_parse['sentences']):
        for dep in sent['basicDependencies']:
            word_to_dep[i][dep['dependentGloss'] +
                           str(dep['dependent'])] = dep['dep']
            word_to_par[i][dep['dependentGloss'] +
                           str(dep['dependent'])] = dep['governorGloss'] + str(
                               dep['governor'])
            index_to_word[dep['dependentGloss'] +
                          str(dep['dependent'])] = dep['dependentGloss']

            if (dep['governorGloss'] + str(dep['governor'])
                    not in word_to_child[i]):
                word_to_child[i][dep['governorGloss'] +
                                 str(dep['governor'])] = []
            if (dep['dependentGloss'] + str(dep['dependent'])
                    not in word_to_child[i]):
                word_to_child[i][dep['dependentGloss'] +
                                 str(dep['dependent'])] = []
            word_to_child[i][dep['governorGloss'] +
                             str(dep['governor'])].append(
                                 dep['dependentGloss'] + str(dep['dependent']))
            sents[i].append(dep['dependentGloss'] + str(dep['dependent']))
        word_to_dep[i]['ROOT0'] = 'root'
        word_to_par[i]['ROOT0'] = 'root'

    for i, sent in enumerate(pos['sentences']):
        for pos_tagger in sent['tokens']:
            word_to_pos[i][pos_tagger['word']] = pos_tagger['pos']
            word_to_lemma[i][pos_tagger['word']] = pos_tagger['lemma']
        word_to_pos[i]['ROOT'] = 'root'
        word_to_lemma[i]['ROOT'] = 'root'
    '''
		Displaying the deps
	'''

    ## Implementing rules to extract aspects
    for i, sent in enumerate(sents):
        if (__name__ == '__main__'):
            print(word_to_dep[i], word_to_par[i], word_to_pos[i],
                  word_to_lemma[i])
            print("Children==>")
            print(word_to_child[i])

    for i, sent in enumerate(sents):
        token_t = word_to_child[i]['ROOT0'][0]
        is_sub = False
        token_h = None
        for child in word_to_child[i][token_t]:
            if 'subj' in word_to_dep[i][child]:
                is_sub = True
                token_h = child

        #If subject noun relationship present
        if is_sub:
            """
				Rule 0: if any adv or adj modifies the token t.

			"""
            for child in word_to_child[i][token_t]:
                if ('amod' in word_to_dep[i][child]
                        or 'advmod' in word_to_dep[i][child]):
                    try:
                        concept_info = sn.concept(index_to_word[child])
                        aspect_result[i].append(token_t)
                        if __name__ == '__main__':
                            print("Rule 1 triggered.")
                            print("present in senticnet")
                    except KeyError:
                        print("OOps")
            """
				Rule 1: The battery lasts little.

			"""
            for child in word_to_child[i][token_t]:
                if (word_to_dep[i][child] == 'advmod' or word_to_dep[i][child]
                        == 'amod' or word_to_dep[i][child] == 'advcl') and (
                            'VB' in word_to_pos[i][index_to_word[token_t]]):
                    aspect_result[i].append(token_t)
                    aspect_result[i].append(token_h)
                    if __name__ == '__main__':
                        print("Rule 1 triggered.")
                        print(token_t)
                        print(token_h)
            """
				Rule 2: I like the beauty of the screen (and I like the lens of this camera). 

			"""
            for child in word_to_child[i][token_t]:
                if (word_to_dep[i][child] == 'dobj'
                        and 'NN' in word_to_pos[i][index_to_word[child]]):
                    aspect_result[i].append(child)
                    if __name__ == '__main__':
                        print(child)
                    try:
                        concept_info = sn.concept(index_to_word[child])
                        if __name__ == '__main__':
                            print("Rule 2 triggered")
                        for grandchild in word_to_child[i][child]:
                            if ('NN' in word_to_pos[i][
                                    index_to_word[grandchild]]):
                                aspect_result[i].append(grandchild)
                                print(grandchild)
                    except KeyError:
                        print("OOps")
            """
				Rule 3: I would like to comment on the camera of this phone.
	
			"""
            for child in word_to_child[i][token_t]:
                if (word_to_dep[i][child] == 'xcomp'):
                    try:
                        sn.concept(index_to_word[child])
                        aspect_result[i].append(child)
                        if __name__ == '__main__':
                            print(child)
                    except KeyError:
                        print("OOps")
                    for grandchild in word_to_child[i][child]:
                        if ('NN' in word_to_pos[i][index_to_word[grandchild]]):
                            aspect_result[i].append(grandchild)
                            if __name__ == '__main__':
                                print(grandchild)
                                print("Rule 3 triggered.")
            """
				Rule 4: The car is expensive.

			"""
            for child in word_to_child[i][token_t]:
                if (word_to_dep[i][child] == 'cop'):
                    try:
                        sn.concept(word_to_lemma[i][index_to_word[token_t]])
                        aspect_result[i].append(token_t)
                        if __name__ == '__main__':
                            print("Rule 4 triggered")
                            print(token_t)
                    except KeyError:
                        pass
            """
				Rule 5: The camera is nice

			"""
            for child in word_to_child[i][token_t]:
                if (word_to_dep[i][child] == 'cop'
                        and 'NN' in word_to_pos[i][index_to_word[token_h]]):
                    aspect_result[i].append(token_h)
                    if __name__ == '__main__':
                        print("Rule 5 triggered.")
                        print(token_h)
            """
				Rule 6: 

			"""
            for child in word_to_child[i][token_t]:
                if (word_to_dep[i][child] == 'cop'):
                    for child2 in word_to_child[i][token_t]:
                        if (child != child2 and 'VB'
                                in word_to_pos[i][index_to_word[child2]]):
                            try:
                                sn.concept(index_to_word[token_t])
                                sn.concept(index_to_word[child2])
                                aspect_result[i].append(token_t)
                                aspect_result[i].append(child2)
                                if __name__ == '__main__':
                                    print("rule 6 trigg")
                                    print(token_t)
                                    print(child2)
                            except KeyError:
                                pass
        else:
            """
				Rule 7:Very big to hold.

			"""
            for word in sent:
                if ('RB' in word_to_pos[i][index_to_word[word]]
                        or 'JJ' in word_to_pos[i][index_to_word[word]]):

                    for child in word_to_child[i][word]:
                        if (word_to_dep[i][child] == 'xcomp'
                                or word_to_dep[i][child] == 'ccomp'):
                            aspect_result[i].append(word)
                            if __name__ == '__main__':
                                print("Rule 7 triggered")
                                print(word)
            """
				Rule 8: Love the sleekness of the player.
			"""
            for word in sent:
                for child in word_to_child[i][word]:
                    if ('NN' in word_to_pos[i][index_to_word[child]]
                            and word_to_dep[i][child] == 'nmod'):
                        for grandchild in word_to_child[i][child]:
                            if ('IN' in word_to_pos[i][
                                    index_to_word[grandchild]]):
                                aspect_result[i].append(word)
                                aspect_result[i].append(child)
                                if __name__ == '__main__':
                                    print(word)
                                    print(child)
                                    print("Rule 8 triggered.")
            """
				Rule 9: Not to mention the price of the phone.

			"""
            for word in sent:
                for child in word_to_child[i][word]:
                    if (word_to_dep[i][child] == 'dobj'):
                        aspect_result[i].append(child)
                        if __name__ == '__main__':
                            print(child)
                            print("Rule 9 triggered")
            '''
				Rule 11 : Checking for conjunctions
			'''
        for asp in aspect_result[i]:
            for word in sent:
                if (word_to_dep[i][word] == 'conj'
                        and word_to_par[i][word] == asp):
                    aspect_result[i].append(word)
                    if (__name__ == '__main__'):
                        print("Rule conj triggered.")
                        print(word)

    finalIAC = [set(aspect_result[i]) for i in range(len(sents))]
    finalIAC = [[index_to_word[w] for w in finalIAC[i]]
                for i in range(len(sents))]

    print(finalIAC)
    singleFinalIAC = []
    for i in range(len(sents)):
        for w in finalIAC[i]:
            if w not in stop_words:
                singleFinalIAC.append(w)
    print(singleFinalIAC)

    finalSenti = []
    for iac in singleFinalIAC:
        try:
            concept_info = sn.concept((iac))
            finalSenti.append(iac)
        except KeyError:
            print("No word available for " + iac)

    return singleFinalIAC, finalSenti
Example #21
# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""
import nltk
nltk.download('wordnet')  # fetch only the needed corpus instead of opening the GUI downloader

import os

from senticnet.senticnet import SenticNet
sn = SenticNet()
sn.concept('love')  # '' is not a valid concept and would raise an uncaught KeyError


def fun1(d):
    try:
        from senticnet.senticnet import SenticNet
        sn = SenticNet()
        sn.semantics(d)
        return True
    except KeyError as error:
        return False


fun1('day')

from nltk.corpus import wordnet
sk = wordnet.synsets('ssd')  # synsets() looks up by lemma; synset() expects a 'name.pos.nn' identifier
Example #22
from senticnet.senticnet import SenticNet

sn = SenticNet('ru')

word = input('Enter your comment (e.g. "как дела"): ')

lst = word.split()

#concept_info = sn.concept(word)
#polarity_value = sn.polarity_value(word)
#polarity_intense = sn.polarity_intense(word)
#moodtags = sn.moodtags(word)
#semantics = sn.semantics(word)

print(list(map(lambda x: sn.sentics(x), lst)))
pop = input(" ")  # keep the console window open
Example #23
import bagofconcepts as boc

# Each line of the corpus file corresponds to one document of the corpus
#boc_model=boc.BOCModel(doc_path="input corpus path")
boc_model = boc.BOCModel('text.txt')

#boc_model.context = text

# output can be saved with the save_path parameter
boc_matrix, word2concept_list, idx2word_converter = boc_model.fit()

# SenticNet lexicon lookup
from senticnet.senticnet import SenticNet

sn = SenticNet()

text = 'love'  # example concept to look up (any single SenticNet concept works)

concept_info = sn.concept(text)
polarity_value = sn.polarity_value(text)
polarity_intense = sn.polarity_intense(text)
moodtags = sn.moodtags(text)
semantics = sn.semantics(text)
sentics = sn.sentics(text)

print('==================================')
print('test: ', text)
print('concept_info: ', concept_info)
print('polarity_value: ', polarity_value)
print('polarity_intense: ', polarity_intense)
print('moodtags: ', moodtags)
print('semantics: ', semantics)
print('sentics: ', sentics)
Example #24
def data_Preprocessing(data, data_test, n_of_words, polarity_threshold):
    Reviews = data["Content"]
    # Strip any non-ASCII characters
    # (string.printable offers another way to filter characters if needed)
    for i in range(0, len(Reviews)):
        Reviews.iloc[i] = Reviews.iloc[i].encode('ascii', errors='ignore').decode()

    # Set all the content to lower case
    Reviews = Reviews.apply(lambda row: row.lower())

    # Add to the follow variable the characters that you want to delete
    chars_to_del = "[" + string.punctuation + string.digits + "]"
    # Delete all the chars in "chars_to_del" from each row of the dataframe
    Reviews = Reviews.apply(lambda row: re.sub(chars_to_del, '', row))
    # Tokenize every single words of the data content
    Token_Reviews = Reviews.apply(lambda row: nltk.word_tokenize(row))

    # Generate the list "stop" of elements TO BE REMOVED from the sentences (stopwords, numbers and punctuation)
    stop = stopwords.words("english")
    # Remove all the words in the variable "stop"
    Filtered_Review = Token_Reviews.apply(
        lambda row: [w for w in row if w not in stop])

    # Stemming the data's content
    # Stemming the Filtered sentence, some stemmed words:
    # http://snowball.tartarus.org/algorithms/english/stemmer.html
    ps = PorterStemmer()
    for idx in range(0, len(Filtered_Review)):
        Stemmed_Review_temp = []
        for word in Filtered_Review.iloc[idx]:
            Stemmed_Review_temp.append(ps.stem(word))
        Filtered_Review.iloc[idx] = Stemmed_Review_temp

    # Terms choosing: most common word
    sn = SenticNet()

    Filtered_Review_List = list(itertools.chain.from_iterable(Filtered_Review))
    Words_Frequency = FreqDist(Filtered_Review_List)
    Most_Common_Words_Frequency = Words_Frequency.most_common(n_of_words)

    Most_Common_Words = []
    for i in range(0, n_of_words):
        Most_Common_Words.append(Most_Common_Words_Frequency[i][0])

    index = 1
    words_and_polarity = pd.DataFrame(columns=["Word", "Polarity"])
    Selected_Words = []
    # Terms polarity
    for word in Most_Common_Words:
        try:
            temp = sn.polarity_intense(word)
            if (float(temp) > polarity_threshold
                    or float(temp) < -(polarity_threshold)):
                words_and_polarity.loc[index] = [word, float(temp)]
                index = index + 1
                Selected_Words.append(word)
        except Exception:
            continue
    # Uncomment to recompute the selected words and their polarity
    #words_and_polarity.to_csv("Words_and_Polarity.csv", sep=",")

    return data, data_test
Example #25
# for w in Dict.keys():
#     if w not in porvalis and w not in slangwords and w not in kaggeleSentiment:
#         print(w)
#         c = c + 1


# print(c)
def Precision(tp, fp):
    return tp / (tp + fp)


def Recall(tp, fn):
    return tp / (tp + fn)
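
For completeness, F1 can be built from these two helpers (a small sketch, not part of the original script):

def F1(tp, fp, fn):
    p, r = Precision(tp, fp), Recall(tp, fn)
    return 2 * p * r / (p + r)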


sn = SenticNet()
zeroSen = 0
tp = 0
tn = 0
fp = 0
fn = 0
actT = 0
with open("Dataset.pickle", "rb") as handle:
    pyDS = pickle.load(handle)
    for doc in pyDS.DocList:
        totSen = 0
        for w in doc.TermList:
            try:
                sen = sn.polarity_intense(w)
            except KeyError:
                sen = 0
Example #26
import stanza as st
import csv
import textstat
import numpy as np
import matplotlib.pyplot as plt
import random
import copy
from gensim.models import KeyedVectors
from senticnet.senticnet import SenticNet
sn = SenticNet()

import language_tool_python
tool = language_tool_python.LanguageTool('en-US')

from transformers import pipeline
from wordhoard import antonyms, synonyms
from nltk import tokenize
from nltk.corpus import wordnet
from nltk.tokenize.treebank import TreebankWordDetokenizer
nlp = pipeline("fill-mask",model="bert-large-uncased",tokenizer="bert-large-uncased")


surname = np.load('surname.npy', allow_pickle=True)
malenames = np.load('male.npy', allow_pickle=True)
femalenames = np.load('female.npy', allow_pickle=True)
unsex = np.load('unsex.npy', allow_pickle=True)
boy = np.load('boy.npy', allow_pickle=True)
girl  = np.load('girl.npy', allow_pickle=True)
gender_word=[['boy', 'girl'], ['boys', 'girls'], ['nephew', 'niece'], ['brother', 'sister'], ['brothers', 'sisters'],['boyfriend', 'girlfriend'],
             ['dad', 'mom'], ['father', 'mother'], ['grandfather', 'grandmother'], ['grandpa', 'grandma'], ['grandson', 'granddaughter'], ['male','female'],
             ['groom', 'bride'], ['husband', 'wife'], ['king', 'queen'], ['man', 'woman'],['men','women'], ['policeman', 'policewoman'], ['prince', 'princess'],
Beispiel #27
0
from senticnet.senticnet import SenticNet

sn = SenticNet()
print("polarity value:", sn.polarity_value("love"))
print("polarity intense:", sn.polarity_intense("love"))
print("moodtags:", ", ".join(sn.moodtags("love")))
print("semantics:", ", ".join(sn.semantics("love")))
print("\n".join([key + ": " + str(value) for key, value in sn.sentics("love").items()]))
# Text classification on the 5 SenticNet scales, evaluated with XGBoost

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from senticnet.senticnet import SenticNet
from nltk.stem import WordNetLemmatizer
from sklearn import metrics
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

df = pd.read_csv('preprocessed.csv')

lemmatizer = WordNetLemmatizer()    # reduces words to their base form
sn = SenticNet()   # knowledge base classifying words and expressions by meaning and mood


# Function returning scores for the input text on the 5 SenticNet scales: polarity intensity,
# pleasantness, attention, sensitivity and aptitude. Each score is the sum of the scores of
# all words and expressions from the text found in the base, normalized by the word count
def SN(data):
    # Vectorize the text, building a vocabulary of words and phrases up to 3 words long
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(1,3))
    vec = vectorizer.fit_transform([data]).todense()
    k = 0
    polarity_intense = sentics_pleasant = sentics_attention = sentics_sense = sentics_aptitude = 0
    for i in vectorizer.vocabulary_.keys():
        try:  # Look up the i-th word/phrase in the base
            num_repetitions = vec[0, vectorizer.vocabulary_[i]]
            polarity_intense += (float(sn.polarity_intense(i)) * num_repetitions)
            sentics_pleasant += (float(sn.sentics(i)['pleasantness']) * num_repetitions)
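            # The source example is cut off here; the following lines are an
            # assumed completion mirroring the pattern above, not original code
            sentics_attention += (float(sn.sentics(i)['attention']) * num_repetitions)
            sentics_sense += (float(sn.sentics(i)['sensitivity']) * num_repetitions)
            sentics_aptitude += (float(sn.sentics(i)['aptitude']) * num_repetitions)
            k += num_repetitions
        except KeyError:  # word/phrase not in the SenticNet base
            continue
    # Average over the number of matched terms (an assumption; the header
    # comment says "normalized by the word count")
    if k:
        return [v / k for v in (polarity_intense, sentics_pleasant,
                                sentics_attention, sentics_sense, sentics_aptitude)]
    return [0, 0, 0, 0, 0]
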
    def __init__(self):
        self.col = ['Name', 'Brand', 'Price', 'Title', 'Score', 'Time', 'Text']
        self.sn = SenticNet('en')
        self.wordnet_lemmatizer = WordNetLemmatizer()

# Imports assumed by the example below (the original snippet omits them)
import threading
import time

import pandas as pd
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from senticnet.senticnet import SenticNet
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC


class Sarcasm:
    def __init__(self, *args, **kwargs):
        # load the required NLP tools
        self.nlp = spacy.load("en_core_web_sm")
        self.senti = SenticNet()
        self.sid = SentimentIntensityAnalyzer()
        # load the dataset
        self.df = pd.read_json("./Sarcasm_Headlines_Dataset.json", lines=True)
        self.df = self.df[:15000]
        self.df.drop(columns="article_link",
                     inplace=True)  # drop the unneeded attribute
        # store spaCy-processed headlines in self.headlines
        self.headlines = []
        self.uni_gram = set()
        self.uni_feature = []
        self.y_ = []
        for i in self.df['headline']:
            self.headlines.append(self.nlp(i))

    def w_score(self, w):
        """
        Input: word
        Score the word by combining NLTK's VADER (vader_lexicon) compound
        score with SenticNet's polarity intensity.
        """
        ss = self.sid.polarity_scores(w)['compound']
        try:
            sn = self.senti.polarity_intense(w)
            sn = float(sn)
            if ss == 0:
                return sn
            else:
                return (sn + ss) / 2

        except KeyError:
            # word not in SenticNet: fall back to the VADER score alone
            return ss
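
        # e.g. w_score("love"): both VADER's compound score and SenticNet's
        # polarity intensity are positive, so their average is returned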

    def sentimentScore(self, sent):
        """
        Input: sentence
        Return whether a sentiment contradiction occurs, together with
        the summed positive and negative word scores.
        """
        sum_pos_score = 0
        sum_neg_score = 0
        for w in sent:
            if w.lemma_ == '-PRON-':
                score = self.w_score(w.text)
            else:
                score = self.w_score(w.lemma_)
            if score > 0:
                sum_pos_score += score
            else:
                sum_neg_score += score
        if sum_pos_score > 0 and sum_neg_score < 0:
            return ("contradict", sum_pos_score, sum_neg_score)
        else:
            return ("anything", sum_pos_score, sum_neg_score)

    def coherence(self, s1, s2):
        '''
        Input: sentence1, sentence2 (spaCy sentence spans)
        Rule1:- Pronoun match feature - reflexive, personal and possessive pronouns
        Rule2:- String match feature - ignore stop words
        Rule3:- Definite noun phrase feature - the phrase in s2 starts with "the"
        Rule4:- Demonstrative noun phrase feature - the phrase in s2 starts with "this", "that", "these" or "those"
        Rule5:- Both proper names feature - the phrases in s1 and s2 are both named entities
        '''
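        # note: only rules 1, 3 and 4 are implemented in this snippet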
        # subject and object of s1 and s2
        sub1 = ""
        sub2 = ""
        obj1 = ""
        obj2 = ""

        for i in s1.noun_chunks:
            if i.root.dep_ == 'nsubj':
                sub1 = i.root
            if i.root.dep_ == 'pobj':
                obj1 = i.root
        for j in s2.noun_chunks:
            if j.root.dep_ == 'nsubj':
                # rule 1: pronoun match on subjects
                if not isinstance(sub1, str) and sub1.pos_ == 'PRON' and j.root.pos_ == 'PRON':
                    if sub1.text.lower() == j.root.text.lower():
                        return "coherent"
                # rule 3: definite noun phrase
                if j[0].text.lower() == 'the':
                    return "coherent"
                # rule 4: demonstrative noun phrase
                if j[0].text.lower() in ['this', 'that', 'these', 'those']:
                    return "coherent"
            if j.root.dep_ == 'pobj':
                # rule 1: pronoun match on objects
                if not isinstance(obj1, str) and obj1.pos_ == 'PRON' and j.root.pos_ == 'PRON':
                    if obj1.text.lower() == j.root.text.lower():
                        return "coherent"
        return "Not coherent"

    def to_string_from_list(self, l):
        # join tokens with single spaces
        return " ".join(l)

    def n_gram_feature(self, text, n):
        """
        Input: spaCy-processed headline
        Return the set of n-grams of the text (lemmatized, punctuation removed)
        """
        one_list = []
        for tok in text:
            if not tok.is_punct:
                if tok.lemma_ != '-PRON-':
                    one_list.append(tok.lemma_)
                else:
                    one_list.append(tok.text)
        # drop a stray whitespace token, if any
        try:
            one_list.remove(' ')
        except ValueError:
            pass
        # build the set of n-grams
        _list = []
        for i, t in enumerate(one_list):
            if len(one_list[i:n + i]) >= n:
                _list.append(self.to_string_from_list(one_list[i:n + i]))
        return set(_list)
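
    # e.g. for a headline "cats chase mice" and n=2 this yields lemmatized
    # bigrams such as {"cat chase", "chase mouse"}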

    def contradiction_feature(self, headline):
        '''
        Contradiction feature
        Input: spaCy-processed headline; returns a pair
        (single-sentence contradiction, multi-sentence contradiction)
        '''
        #for single sentence headline
        if len(list(headline.sents)) == 1:
            if self.sentimentScore(headline)[0] == 'contradict':
                return (1, 0)
            else:
                return (0, 0)
        #for multisentence headline
        else:
            if self.sentimentScore(headline)[0] == 'contradict':
                sent = list(headline.sents)
                i = 0
                while i < len(sent) - 1:
                    # check each adjacent sentence pair for coherence
                    if self.coherence(sent[i], sent[i + 1]) != "coherent":
                        return (0, 0)
                    i += 1
                return (0, 1)

            else:
                return (0, 0)

    def baseline3(self):
        '''
        Use of sentiment analysis + coherence
        '''
        predictions = []
        for i in self.headlines:
            get = self.contradiction_feature(i)
            if get == (1, 0) or get == (0, 1):
                predictions.append(1)
            else:
                predictions.append(0)
        return (confusion_matrix(self.df['is_sarcastic'], predictions),
                classification_report(self.df['is_sarcastic'], predictions),
                accuracy_score(self.df['is_sarcastic'], predictions))

    def baseline1(self):
        predictions = []
        for p in self.headlines:
            co, _, _ = self.sentimentScore(p)
            if (co == 'contradict'):
                predictions.append(1)
            else:
                predictions.append(0)
        return (confusion_matrix(self.df['is_sarcastic'], predictions),
                classification_report(self.df['is_sarcastic'], predictions),
                accuracy_score(self.df['is_sarcastic'], predictions))

    def uni_gram_features(self, start, end, n=1):
        # NOTE: baseline2 calls this from four threads; uni_gram, uni_feature
        # and y_ are shared and mutated without locks, so feature and label
        # rows may interleave across chunks
        self.uni_gram = sorted(self.uni_gram)
        index = start
        for p in self.headlines[start:end]:
            uni = [0 for i in range(len(self.uni_gram))]
            for i, j in enumerate(p):
                temp = []  # tokens of the current candidate n-gram
                if len(p[i:n + i]) >= n:
                    for k in range(n):

                        if p[i + k].lemma_ != '-PRON-':
                            temp.append(p[i + k].lemma_)
                        else:
                            temp.append(p[i + k].text)

                    temp = self.to_string_from_list(temp)
                    if temp in self.uni_gram:
                        uni[self.uni_gram.index(temp)] = 1
            self.y_.append(self.df['is_sarcastic'][index])
            index += 1
            self.uni_feature.append(uni)

    def baseline2(self, n=1):
        #unigram features
        self.uni_gram = set()
        self.uni_feature = []
        self.y_ = []
        for p in self.headlines:
            self.uni_gram = self.uni_gram.union(self.n_gram_feature(p, n))

        # build the feature rows over four chunks in parallel
        length = len(self.headlines)
        t1 = threading.Thread(target=self.uni_gram_features,
                              name='t1',
                              args=(0, int(length / 4), n))
        t2 = threading.Thread(target=self.uni_gram_features,
                              name='t2',
                              args=(int(length / 4), int(length / 2), n))
        t3 = threading.Thread(target=self.uni_gram_features,
                              name='t3',
                              args=(int(length / 2), int(3 * length / 4), n))
        t4 = threading.Thread(target=self.uni_gram_features,
                              name='t4',
                              args=(int(3 * length / 4), length, n))
        t1.daemon = True
        t2.daemon = True
        t3.daemon = True
        t4.daemon = True
        st = time.time()
        t1.start()
        t2.start()
        t3.start()
        t4.start()
        t1.join()
        t2.join()
        t3.join()
        t4.join()
        print(f'time taken: {time.time()-st}')
        X_train, X_test, y_train, y_test = train_test_split(self.uni_feature,
                                                            self.y_,
                                                            test_size=0.33,
                                                            random_state=42)
        return self.findLINEARSVCResult(X_train, X_test, y_train, y_test)

    def findLINEARSVCResult(self, X_train, X_test, y_train, y_test):
        '''
         Training data using LinearSVC model
        '''
        svc_model = LinearSVC()
        svc_model.fit(X_train, y_train)
        predictions = svc_model.predict(X_test)
        return (confusion_matrix(y_test, predictions),
                classification_report(y_test, predictions),
                accuracy_score(y_test, predictions))
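

# A minimal usage sketch (assumes Sarcasm_Headlines_Dataset.json is present
# alongside the imports added above; not part of the original example):
# sar = Sarcasm()
# cm, report, acc = sar.baseline1()
# print(report)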