from senticnet.senticnet import SenticNet


def doc_sentiment(data):
    """Return 1 if the average SenticNet polarity of a tokenized document is non-negative, else 0."""
    # Call SenticNet module
    sn = SenticNet()
    # Running total of word polarities
    total_sentiment = 0
    # Calculate sentiment for all words in the document
    for i in range(len(data)):
        # Words missing from the SenticNet vocabulary raise an error;
        # we treat these words as if they have a sentiment of 0
        try:
            # Calculate sentiment of the word
            # Note: in recent senticnet releases polarity_value returns a
            # 'positive'/'negative' label and polarity_intense holds the numeric score
            sentiment = sn.polarity_value(data[i])
            # Update total sentiment
            total_sentiment += float(sentiment)
        except (KeyError, ValueError):
            pass
    try:
        # If the document is empty, the division below raises ZeroDivisionError
        # Calculate average sentiment for the document
        avg_sentiment = total_sentiment / len(data)
    except ZeroDivisionError:
        avg_sentiment = 0
    if avg_sentiment >= 0:
        output = 1
    else:
        output = 0
    return output
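# Hedged usage sketch for doc_sentiment above: the function expects an already
# tokenized document (a list of word strings); the sample sentence is illustrative
# and the result depends on the installed SenticNet data.
if __name__ == "__main__":
    sample_tokens = "the battery life is amazing but the screen is awful".split()
    print(doc_sentiment(sample_tokens))  # 1 for non-negative average polarity, 0 otherwise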
import itertools

import nltk
import numpy as np
from nltk.probability import FreqDist
from senticnet.senticnet import SenticNet


def Terms_Chooser(data, n_of_words, polarity_threshold):
    sn = SenticNet()
    # Choosing words: tokenize every review and flatten into one word list
    data["Content"] = data["Content"].apply(lambda row: nltk.word_tokenize(row))
    lista = np.array(data["Content"].values.tolist())
    lista = list(itertools.chain.from_iterable(lista))
    FD = FreqDist(lista)
    MC = FD.most_common(n_of_words)
    common_words = []
    for i in range(0, n_of_words):
        common_words.append(MC[i][0])
    polarity = list()
    words = list()
    # Keep only the common words whose SenticNet polarity intensity exceeds the threshold
    for x in common_words:
        try:
            temp = sn.polarity_intense(x)
            if (float(temp) > polarity_threshold
                    or float(temp) < -polarity_threshold):
                polarity.append(temp)
                words.append(x)
        except Exception:
            continue
    return words
from senticnet.senticnet import SenticNet


class Dictionary:
    def __init__(self):
        self.sn = SenticNet()

    def get_word_polarity(self, word, log=True):
        """
        Input : String
        Output : "positive" or "negative" (or "empty" if the word is unknown)
        """
        value = "empty"
        try:
            value = self.sn.polarity_value(word.lower())
        except KeyError:
            if log:
                print('An error occurred. Word: ' + word + ' is not known.')
        return value

    def get_word_polarity_numerical_value(self, word, log=True):
        """
        Input : String
        Output : polarity intensity in [-1, 1] (or "empty" if the word is unknown)
        """
        value = "empty"
        try:
            value = self.sn.polarity_intense(word.lower())
        except KeyError:
            if log:
                print('An error occurred. Word: ' + word + ' is not known.')
        return value
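# Hedged usage sketch for the Dictionary wrapper above; "awkward" is an arbitrary
# example word and the printed values depend on the installed SenticNet data.
if __name__ == "__main__":
    d = Dictionary()
    print(d.get_word_polarity("awkward"))                  # e.g. "negative", or "empty" if unknown
    print(d.get_word_polarity_numerical_value("awkward"))  # intensity in [-1, 1], or "empty" if unknown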
from senticnet.senticnet import SenticNet


def senticnet(text):
    """
    Returns a list obtained from SenticNet with the following four features normalized:
    [pleasantness_value, attention_value, sensitivity_value, aptitude_value]
    :param text: input text pre-processed by Spacy
    :return: a list with the SenticNet features averaged over all the words in text
    """
    list_features = [0] * 4
    sn = SenticNet()
    count_words = 0
    for token in text:
        try:
            concept_info = sn.concept(token)
            list_features[0] += float(concept_info['sentics']['pleasantness'])
            list_features[1] += float(concept_info['sentics']['attention'])
            list_features[2] += float(concept_info['sentics']['sensitivity'])
            list_features[3] += float(concept_info['sentics']['aptitude'])
            count_words += 1
        except KeyError:
            pass
    if count_words != 0:
        list_features = [feature / count_words for feature in list_features]
    return list_features
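# Hedged usage sketch for senticnet() above: the function only needs an iterable of
# word strings, so plain lowercase tokens work; with spaCy, [t.lower_ for t in nlp(text)]
# would produce the same kind of input. Unknown words raise KeyError and are skipped,
# and the returned values depend on the installed SenticNet version/data.
if __name__ == "__main__":
    print(senticnet(["love", "celebrate", "qwertyuiop"]))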
def getSentics(self, word):
    senticsAndItensity = []
    sn = SenticNet('en')
    try:
        sentics = sn.sentics(word)
        polarity_intensity = sn.polarity_intense(word)
        # print(sentics)
        # print(sentics['pleasantness'])
        # print(sentics['attention'])
        # print(sentics['sensitivity'])
        # print(sentics['aptitude'])
        # print(polarity_intensity)
        senticsAndItensity.append(float(sentics['pleasantness']))
        senticsAndItensity.append(float(sentics['attention']))
        senticsAndItensity.append(float(sentics['sensitivity']))
        senticsAndItensity.append(float(sentics['aptitude']))
        senticsAndItensity.append(float(polarity_intensity))
        return senticsAndItensity
    except Exception as e:
        defaultsentics = [0.0, 0.0, 0.0, 0.0, 0.0]
        return defaultsentics


# ##TESTING AREA
# yas = SenticValuer()
# print(yas.getSentics("awkward"))
def sem(d):
    try:
        sn = SenticNet()
        sn.semantics(d)
        return True
    except KeyError:
        return False
def fun1(d):
    try:
        from senticnet.senticnet import SenticNet
        sn = SenticNet()
        sn.semantics(d)
        return True
    except KeyError as error:
        return False
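# Hedged usage sketch: filtering a token list down to the words that SenticNet knows,
# using the fun1 membership test defined above. The token list is illustrative.
if __name__ == "__main__":
    tokens = ["love", "celebrate", "qwertyuiop"]
    print([t for t in tokens if fun1(t)])  # unknown tokens are dropped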
def get_emotions(tokens):
    from senticnet.senticnet import SenticNet
    result = {}
    sn = SenticNet()
    for token in tokens:
        moodtags = ""
        if token in sn.data:
            moodtags = sn.moodtags(token)
            print(token, moodtags)
            # collect the mood tags for each token known to SenticNet
            result[token] = moodtags
    return result
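# Hedged usage sketch for get_emotions above; the token list is illustrative and the
# mood tags returned depend on the installed SenticNet data.
if __name__ == "__main__":
    print(get_emotions(["love", "happy", "qwertyuiop"]))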
def __init__(self): print("Start SenticNet - Sentiment Analysis") self.sp = Support() self.sn = SenticNet() self.corpus = self.sp.import_corpus_bank() self.terminology = self.sp.import_bank_terminology(filename='bank_terminology') self.data, self.label = self.sp.process_data(filename='bank_message', size_msg=3, clean=True, replace_text=True, stemmed=None, lemmatize=None, spelling=None)
def getMaxSum_senti(text):
    wnl = WordNetLemmatizer()
    sn = SenticNet()
    sentences = nltk.sent_tokenize(text)
    text_sentiAvg = 0
    sentence_maxSenti = 0
    for index in range(len(sentences)):
        sentence = sentences[index].strip()
        sentence = sentence[0:-1]  # drop the final character (the sentence-ending period)
        assert '.' not in sentence
        words = nltk.word_tokenize(sentence.lower())
        pos_tags = nltk.pos_tag(words)
        sentence_sentiSum = getSentenceSentiSum(pos_tags, wnl, sn)
        # print sentence_sentiSum,
        if sentence_sentiSum > sentence_maxSenti:
            sentence_maxSenti = sentence_sentiSum
        text_sentiAvg += sentence_sentiSum
    text_sentiAvg = text_sentiAvg / len(sentences)
    text_sentiAvg = round(text_sentiAvg, 6)
    sentence_maxSenti = round(sentence_maxSenti, 6)
    return text_sentiAvg, sentence_maxSenti
from scipy.sparse import csr_matrix, vstack
from senticnet.senticnet import SenticNet
from sklearn.base import BaseEstimator, TransformerMixin


class SenticNets(BaseEstimator, TransformerMixin):
    def __init__(self, vocab):
        self.vocab = vocab
        self.X_width = len(vocab)
        self.sn = SenticNet()

    def fit(self, X):
        return self

    def vector(self, X):
        # Build a 1 x |vocab| sparse row holding the SenticNet polarity of each known word
        X = X.split(' ')
        zeros = csr_matrix((1, self.X_width))
        for word in range(len(X)):
            if X[word] not in self.vocab:
                continue
            try:
                # Assumes a senticnet version where polarity_value is numeric;
                # in recent releases polarity_intense holds the numeric score
                score = self.sn.polarity_value(X[word])
            except KeyError:
                continue
            zeros[0, self.vocab[X[word]]] = score
        return zeros

    def transform(self, X):
        self.zeros = csr_matrix((0, self.X_width))
        self.X_length = X.shape[0]
        for i in range(self.X_length):
            self.zeros = vstack([self.zeros, self.vector(X[i])])
        return self.zeros

    def fit_transform(self, X, y=None):
        self.fit(X)
        self.transform(X)
        return self.zeros
def __init__(self, *args, **kwargs):
    # loading necessaries
    self.nlp = spacy.load("en_core_web_sm")
    self.senti = SenticNet()
    self.sid = SentimentIntensityAnalyzer()

    # loading dataset
    self.df = pd.read_json("./Sarcasm_Headlines_Dataset.json", lines=True)
    self.df = self.df[:15000]
    self.df.drop(columns="article_link", inplace=True)  # dropping unnecessary attribute

    # storing nlp data in headlines variable
    self.headlines = []
    self.uni_gram = set()
    self.uni_feature = []
    self.y_ = []
    for i in self.df['headline']:
        self.headlines.append(self.nlp(i))
def sentiment_avg(self, text):
    sn = SenticNet('pt')
    list_polarity = []
    temp = text.split()
    # Count words (not characters) so the average is taken over the word total
    qtd_words = len(temp)
    avg_n = 0
    for i in range(len(temp)):
        try:
            polarity_value = sn.polarity_value(
                self.treatment_string(temp[i]))
            list_polarity.append(polarity_value)
        except KeyError:
            # Words missing from SenticNet do not count towards the average
            qtd_words -= 1
    avg_n = self.avg(list_polarity, qtd_words)
    if avg_n > 0.003 or avg_n < -0.003:
        return True
    else:
        return False
from senticnet.senticnet import SenticNet
from sklearn.model_selection import train_test_split


class SenticNetSA:
    def __init__(self):
        print("Start SenticNet - Sentiment Analysis")
        self.sp = Support()
        self.sn = SenticNet()
        self.corpus = self.sp.import_corpus_bank()
        self.terminology = self.sp.import_bank_terminology(filename='bank_terminology')
        self.data, self.label = self.sp.process_data(filename='bank_message',
                                                     size_msg=3,
                                                     clean=True,
                                                     replace_text=True,
                                                     stemmed=None,
                                                     lemmatize=None,
                                                     spelling=None)

    def baseline(self):
        TP = 0
        FP = 0
        FN = 0
        x_train, x_test, y_train, y_test = train_test_split(self.data,
                                                            self.label,
                                                            test_size=0.20,
                                                            random_state=1000)
        for i in range(0, len(x_train)):
            msg = str(x_train[i])
            value = float(y_train[i])
            result = self.sn.message_concept(msg)
            polarity_value = float(result['polarity_value'])
            # Treat weak polarities in (-0.1, 0.1) as neutral
            polarity_value = 0.0 if -0.1 < polarity_value < 0.10 else polarity_value
            if value == polarity_value:
                TP += 1
            else:
                FP += 1
            if value == 1 and (polarity_value == 0.0 or polarity_value == -1.0):
                FN += 1
            elif value == 0.0 and (polarity_value == 1 or polarity_value == -1.0):
                FN += 1
            elif value == -1.0 and (polarity_value == 0.0 or polarity_value == 1.0):
                FN += 1
        precision = TP / (TP + FP)
        recall = TP / (TP + FN)
        f1 = 2 * ((precision * recall) / (precision + recall))
        print("f1-score : {}%".format(round(f1 * 100, 2)))
# To find word occurrence frequencies (using the original list, not the unique-word list)
from nltk.probability import FreqDist

fdist = FreqDist(ALL_filtered_sentence)
print("fdist : {}".format(fdist))
print(fdist.most_common(200))

# ------------------------------------------------------------ SENTICNET
from MdCek.DBRepository.WordList_sentic_Repository import WordList_sentic_Repository
from MdCek.Model.WordList_sentic import WordList_sentic
import pprint
from senticnet.senticnet import SenticNet

wordSentic = WordList_sentic_Repository()
sn = SenticNet()

print("----------------------------------------- reading the existing depression wordlist from NLTK and textblob")
# EXISTING WORDLIST NEGATIVE
existWordlistDepression = wordList_depRepo.read()
lemmaOfExistWordlist = []
conceptExist = []
conceptExistNegative = []
objsConceptExistNegative = []
conceptExistPositive = []
objsConceptExistPositive = []
sisa = 0
# print("Number of existing wordlist entries : {} ".format(len(existWordlistDepression)))
document_term_matrix_idf.head(10)

# Word cloud using the TF-IDF weights of words: some words keep repeating across all
# the docs, and the IDF (log) factor down-weights those common words
words = dict(document_term_matrix_idf.apply(sum, axis=0))  # fit_words() needs a dictionary
wordcloud = WordCloud(max_font_size=40,
                      max_words=50,
                      background_color="white").fit_words(words)  # fit_words() plots a word cloud from a dictionary
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Positive and negative words looked up in the SenticNet lexicon; plotting their word clouds
sn = SenticNet()
positive_words = []
negative_words = []
for word in vectorizer.get_feature_names():
    if word in sn.data:
        if sn.polarity_value(word) == 'positive':
            positive_words.append(word)
        if sn.polarity_value(word) == 'negative':
            negative_words.append(word)

len(positive_words)
len(negative_words)

positive_words = dict(document_term_matrix[positive_words].apply(sum, axis=0))
negative_words = dict(document_term_matrix[negative_words].apply(sum, axis=0))
    # (tail of the truncated merge_list_of_records_by helper)
    for _, records in groupby(sorted(lst, key=keyprop), keyprop)
]

a = [{'time': '25 APR', 'total': 10, 'high': 10},
     {'time': '26 APR', 'total': 5, 'high': 5}]
b = [{'time': '24 APR', 'total': 10, 'high': 10},
     {'time': '26 APR', 'total': 15, 'high': 5}]
merger = merge_list_of_records_by('time', add)
hasil_merge = merger(a + b)
print(hasil_merge)

print("synonyms with thesaurus==================================================================")
# from PyDictionary import PyDictionary
#
# dictionary = PyDictionary()
# print(dictionary.synonym("good"))
from thesaurus import Word

w = Word('suicidal')
syn = w.synonyms()
print(syn)

from senticnet.senticnet import SenticNet

sn = SenticNet()
try:
    concept_info_sinonim = sn.concept("suicidal")
    print(concept_info_sinonim)
except Exception as e:
    print(e)
def get_clues(text): text = text print("*--------(%s)-------------*" % (text)) print(type(text)) nlp = StanfordCoreNLP('http://localhost:9001') stop_words = set(stopwords.words('english')) ''' Method to remove numbers appended at last ''' dep_parse = nlp.annotate(text, properties={ 'annotators': 'depparse', 'outputFormat': 'json', 'timeout': 10000, }) pos = nlp.annotate(text, properties={ 'annotators': 'lemma', 'outputFormat': 'json', 'timeout': 10000, }) sn = SenticNet() word_to_dep = [{} for i in range(len(dep_parse['sentences']))] word_to_par = [{} for i in range(len(dep_parse['sentences']))] word_to_pos = [{} for i in range(len(dep_parse['sentences']))] word_to_lemma = [{} for i in range(len(dep_parse['sentences']))] word_to_child = [{} for i in range(len(dep_parse['sentences']))] sents = [[] for i in range(len(dep_parse['sentences']))] index_to_word = {} ''' Constructing dicts for maintaining the dependencies among words. ''' ''' Appending each word by occurence number to maintain distinct word count ''' #print(dep_parse['sentences']) print("********") for i, sent in enumerate(dep_parse['sentences']): for dep in sent['basicDependencies']: word_to_dep[i][dep['dependentGloss'] + str(dep['dependent'])] = dep['dep'] word_to_par[i][dep['dependentGloss'] + str(dep['dependent'])] = dep['governorGloss'] + str( dep['governor']) index_to_word[dep['dependentGloss'] + str(dep['dependent'])] = dep['dependentGloss'] if (dep['governorGloss'] + str(dep['governor']) not in word_to_child[i]): word_to_child[i][dep['governorGloss'] + str(dep['governor'])] = [] if (dep['dependentGloss'] + str(dep['dependent']) not in word_to_child[i]): word_to_child[i][dep['dependentGloss'] + str(dep['dependent'])] = [] word_to_child[i][dep['governorGloss'] + str(dep['governor'])].append( dep['dependentGloss'] + str(dep['dependent'])) sents[i].append(dep['dependentGloss'] + str(dep['dependent'])) word_to_dep[i]['ROOT0'] = 'root' word_to_par[i]['ROOT0'] = 'root' for i, sent in enumerate(pos['sentences']): for pos_tagger in sent['tokens']: word_to_pos[i][pos_tagger['word']] = pos_tagger['pos'] word_to_lemma[i][pos_tagger['word']] = pos_tagger['lemma'] word_to_pos[i]['ROOT'] = 'root' word_to_lemma[i]['ROOT'] = 'root' ''' Displaying the deps ''' ##Implemeting rules to extract aspects for i, sent in enumerate(sents): if (__name__ == '__main__'): print(word_to_dep[i], word_to_par[i], word_to_pos[i]) print("Children==>") print(word_to_child[i]) aspects = [] for i, sent in enumerate(sents): for word in sent: ''' Rule 0 ''' if ('subj' in word_to_dep[i][word]): for child in word_to_child[i][word_to_par[i][word]]: if ('amod' in word_to_dep[i][child] or 'advmod' in word_to_dep[i][child]): aspects.append(word_to_par[i][word]) if (__name__ == '__main__'): print("Rule 0 triggered.") ''' Rule 1 (without sub): Very big to hold. 
''' if (word_to_dep[i][word] == 'xcomp' and ('JJ' in word_to_pos[i][index_to_word[word_to_par[i][word]]] or 'RB' in word_to_pos[i][index_to_word[word_to_par[i][word]]])): if (__name__ == '__main__'): print("Rule 1 triggered") aspects.append(word_to_par[i][word]) ''' Rule 2 (without subj): Not to mention the price of the phone ''' if (word_to_dep[i][word] == 'dobj' and 'VB' in word_to_pos[i][index_to_word[(word_to_par[i][word])]] and ('NN' in word_to_pos[i][index_to_word[(word)]] or 'JJ' in word_to_pos[i][index_to_word[(word)]])): aspects.append(word) if (__name__ == '__main__'): print("Rule 2 triggered") print(word) ''' Rule 3 (without subj): Love the sleekness of the player ''' if ('NN' in word_to_pos[i][index_to_word[(word)]] and word_to_dep[i][word] == 'nmod'): aspects.append(word_to_par[i][word]) if (__name__ == '__main__'): print("Rule 3 triggered") print(word_to_par[i][word]) ''' Rule 4 (with sub): The battery lasts little two aspects ''' if (word_to_dep[i][word] == 'advmod' or word_to_dep[i][word] == 'amod' or word_to_dep[i][word] == 'advcl') and ('VB' in word_to_pos[i][index_to_word[( word_to_par[i][word])]]): aspects.append(word_to_par[i][word]) for word2 in sent: if (word2 != word and word_to_dep[i][word2] == 'nsubj' and word_to_par[i][word2] == word_to_par[i][word] and ('NN' in word_to_pos[i][index_to_word[word2]] or 'JJ' in word_to_pos[i][index_to_word[word2]])): aspects.append(word2) if (__name__ == '__main__'): print("Rule 4 triggered") print(word2) ''' Rule 5 (with sub): I like the lens of this camera ''' if ('NN' in word_to_pos[i][index_to_word[(word)]] and word_to_dep[i][word] == 'dobj'): if (__name__ == '__main__'): print("Rule 5 triggered") print(word) try: concept_info = sn.concept((word)) print("present in senticnet") except KeyError: print("Yay") aspects.append(word) ''' Rule 6 : I like the beauty of the screen. Check if senticnet condition should be added ''' if ('NN' in word_to_pos[i][index_to_word[(word)]] and word_to_dep[i][word] == 'dobj'): try: concept_info = sn.concept((word)) aspects.append(word) print("yay!") except KeyError: print("oops, not there in SenticNet") for word2 in sent: if (word2 != word and word_to_par[i][word2] == word and 'NN' in word_to_pos[i][index_to_word[(word2)]]): aspects.append(word2) if (__name__ == '__main__'): print("Rule 6 triggered.") print(word2) ''' Rule 7 : I would like to comment on the camera of this phone. ''' if (word_to_dep[i][word] == 'xcomp'): try: concept_info = sn.concept((word)) aspects.append(word) print("yay!") except KeyError: print("oops, not there in SenticNet") for child in word_to_child[i][word]: if ('NN' in word_to_pos[i][index_to_word[child]]): aspects.append(child) if (__name__ == '__main__'): print("Rule 7 triggered.") print(word) print(child) ''' Rule 8 : The car is expensive. ''' if (word_to_dep[i][word] == 'nsubj'): for word2 in sent: if (word2 != word and word_to_dep[i][word2] == 'cop' and word_to_par[i][word2] == word_to_par[i][word]): aspects.append(word_to_par[i][word]) if (__name__ == '__main__'): print("Rule 8 triggered") print(word_to_par[i][word]) ''' Rule 9 : The camera is nice. ''' if (word_to_dep[i][word] == 'nsubj' and 'NN' in word_to_pos[i][index_to_word[(word)]]): for word2 in sent: if (word2 != word and word_to_dep[i][word2] == 'cop' and word_to_par[i][word2] == word_to_par[i][word]): aspects.append(word) if (__name__ == '__main__'): print("Rule 9 triggered") print(word) ''' Rule 10 : The phone is very lightweight to carry. 
''' if (word_to_dep[i][word] == 'cop'): for word2 in sent: if (word2 != word and 'VB' in word_to_pos[i][index_to_word[(word2)]] and word_to_par[i][word] == word_to_par[i][word2]): aspects.append(word2) if (__name__ == '__main__'): print("Rule 10 triggered.") print(word2) ''' Extracting mods of dobjs ''' if (word_to_dep[i][word] == 'dobj'): for child in word_to_child[i][word]: if ('mod' in word_to_dep[i][child] and 'JJ' in word_to_pos[i][index_to_word[(child)]]): aspects.append(child) ''' Rule 11 : Checking for conjuctions ''' for asp in aspects: for word in sent: if (word_to_dep[i][word] == 'conj' and word_to_par[i][word] == asp): aspects.append(word) if (__name__ == '__main__'): print("Rule conj triggered.") print(word) finalIAC = set(aspects) finalIAC = [index_to_word[f] for f in finalIAC] finalIAC = [w for w in finalIAC if not w in stop_words] finalSenti = [] for iac in finalIAC: try: concept_info = sn.concept((iac)) finalSenti.append(iac) except KeyError: print("No word available for " + iac) return finalIAC, finalSenti
class Get_IAC(): def __init__(self): self.col = ['Name', 'Brand', 'Price', 'Title', 'Score', 'Time', 'Text'] self.sn = SenticNet('en') self.wordnet_lemmatizer = WordNetLemmatizer() def review_to_sentences(self, review): # review = review.replace(',','.') review = review.replace('.', '. ') raw_sentences = sent_tokenize(review) return raw_sentences def InputData(self, input_path): self.dict_list = [] if '.csv' in input_path: with open(input_path, 'r', encoding='utf8') as f: reader = csv.DictReader(f) for row in reader: d = {i: row[i] for i in col} self.dict_list.append(d) elif '.xlsx' in input_path: wb = load_workbook(input_path) sheet = wb.active count = 0 for row in sheet.rows: if count == 0: count += 1 continue d = {} name = 0 for cell in row: d[self.col[name]] = cell.value name += 1 self.dict_list.append(d) self.dict_list = [ x for x in self.dict_list if x['Text'] != '' and x['Text'] != None ] self.sentences = [] for i in range(len(self.dict_list)): for j in self.review_to_sentences(self.dict_list[i]['Text']): self.sentences.append(j) self.sentences = [x for x in self.sentences if len(x) >= 5] def GetIAC(self): self.nlp = StanfordCoreNLP(r'stanford-corenlp-full-2018-10-05') self.IAC = [] for i in tqdm(self.sentences): dependency = self.nlp.dependency_parse(i) token = self.nlp.word_tokenize(i) if [x for x in dependency if 'compound' in x] != []: for j in [x for x in dependency if 'compound' in x]: token[j[2] - 1] = token[j[2] - 1] + '-' + token[j[1] - 1] token[j[1] - 1] = '' i = ' '.join(token) parse = self.nlp.parse(i) dependency = self.nlp.dependency_parse(i) pos = self.nlp.pos_tag(i) token = [] for j in pos: wordnet_pos = self.get_wordnet_pos(j[1]) token.append( self.wordnet_lemmatizer.lemmatize(j[0].lower(), pos=wordnet_pos)) # subject noun relation if [x for x in dependency if 'nsubj' in x] != []: for j in self.Subject_Noun_Rule(parse, dependency, token, pos): self.IAC.append(j) else: # Non subject noun relation for j in self.Non_Subject_Noun_Rule(parse, dependency, token, pos): self.IAC.append(j) self.nlp.close() self.IAC = list(set(self.IAC)) def get_wordnet_pos(self, treebank_tag): if treebank_tag.startswith('J'): return wn.ADJ elif treebank_tag.startswith('V'): return wn.VERB elif treebank_tag.startswith('N'): return wn.NOUN elif treebank_tag.startswith('R'): return wn.ADV else: return wn.NOUN # Additional Rule: 對等連接詞 def Conj(self, index, dependency, token): IAC = [] index = list(set(index)) if [x for x in dependency if 'conj' in x] != []: conj = [x for x in dependency if 'conj' in x] for j in conj: if j[1] in index or j[2] in index: if j[1] not in index: IAC.append(token[j[1] - 1]) index.append(j[1]) if j[2] not in index: IAC.append(token[j[2] - 1]) index.append(j[2]) return IAC def Subject_Noun_Rule(self, parse, dependency, token, pos): be = ['is', 'was', 'am', 'are', 'were'] adv_mod = [x for x in dependency if 'advmod' in x] adj_mod = [x for x in dependency if 'amod' in x] active_token = token[[x for x in dependency if 'nsubj' in x][0][2] - 1] # 主詞 result = [] index = [] if adv_mod != [] or adj_mod != []: A, B = self.Rule1(adv_mod, adj_mod, active_token, token) result += A index += B # does not have auxiliary verb if any(k in token for k in be) == False and [x for x in pos if 'MD' in x] == []: A, B = self.Rule2(token, pos, dependency, active_token, adv_mod, adj_mod) result += A index += B if [x for x in dependency if 'dobj' in x] != []: A, B = self.Rule3(dependency, token, pos) result += A index += B if [x for x in dependency if 'xcomp' in x] != []: A, B = 
self.Rule4(dependency, token, pos) result += A index += B if [x for x in dependency if 'cop' in x] != []: A, B = self.Rule5(dependency, pos, active_token, token) result += A index += B result += self.Conj(index, dependency, token) return list(set(result)) # 3.3.3 Rule 1 def Rule1(self, adv_mod, adj_mod, active_token, token): IAC = [] index = [] if adv_mod != []: for j in adv_mod: try: concept = self.sn.concept(token[j[2] - 1]) IAC.append(token[j[2] - 1]) index.append(j[2]) except: a = 0 # print(token[j[2]-1] + ' Not in SenticNet') if adj_mod != []: for j in adj_mod: try: concept = self.sn.concept(token[j[2] - 1]) IAC.append(token[j[2] - 1]) index.append(j[2]) except: a = 0 # print(token[j[2]-1] + ' Not in SenticNet') return IAC, index # 3.3.3 Rule 2-1 def Rule2(self, token, pos, dependency, active_token, adv_mod, adj_mod): IAC = [] index = [] advcl = [x for x in dependency if 'advcl' in x] # adverbial clause modifier if advcl != []: for j in advcl: IAC.append(token[j[1] - 1]) index.append(j[1]) IAC.append(active_token) index.append([x for x in dependency if 'nsubj' in x][0][2]) if adv_mod != []: for j in adv_mod: IAC.append(token[j[1] - 1]) index.append(j[1]) IAC.append(active_token) index.append([x for x in dependency if 'nsubj' in x][0][2]) if adj_mod != []: for j in adj_mod: IAC.append(token[j[1] - 1]) index.append(j[1]) IAC.append(active_token) index.append([x for x in dependency if 'nsubj' in x][0][2]) return IAC, index # 3.3.3 Rule 2-2 & 2-3 def Rule3(self, dependency, token, pos): IAC = [] index = [] dobj = [x for x in dependency if 'dobj' in x] # open clausal complement for j in dobj: if pos[j[2] - 1][1] == 'NN': try: # Rule 2-3 concept = self.sn.concept(token[j[2] - 1]) IAC.append(token[j[2] - 1]) index.append(j[2]) conj = [] conj.append(j[2]) if [x for x in dependency if 'conj' in x and j[2] in x ] != []: for i in [ x for x in dependency if 'conj' in x and j[2] in x ]: conj.append(i[1]) conj.append(i[2]) conj = list(set(conj)) for i in conj: t1 = i connect = [x for x in dependency if t1 in x] for k in connect: if k[1] != t1: if pos[k[1] - 1][1] == 'NN': IAC.append(token[k[1] - 1]) index.append(k[1]) if k[2] != t1: if pos[k[2] - 1][1] == 'NN': IAC.append(token[k[2] - 1]) index.append(k[2]) except: # Rule 2-2 IAC.append(token[j[2] - 1]) index.append(j[2]) # print(token[j[2]-1] + ' Not in SenticNet') return IAC, index # 3.3.3 Rule 2-4 def Rule4(self, dependency, token, pos): IAC = [] index = [] xcomp = [x for x in dependency if 'xcomp' in x] # open clausal complement for j in xcomp: try: concept = self.sn.concept(token[j[1] - 1] + '-' + token[j[2] - 1]) IAC.append(token[j[1] - 1] + '-' + token[j[2] - 1]) except: a = 0 # print(token[j[1]-1] + '-' + token[j[2]-1] + ' Not in SenticNet') t1 = j[2] connect = [x for x in dependency if t1 in x] for k in connect: if pos[k[2] - 1][1] == 'NN': IAC.append(token[k[2] - 1]) index.append(k[2]) return IAC, index # 3.3.3 Rule 3 & 3.3.3 Rule 4 & 3.3.3 Rule 5 def Rule5(self, dependency, pos, active_token, token): IAC = [] index = [] cop = [x for x in dependency if 'cop' in x] # copula # Rule 4 if pos[[x for x in dependency if 'nsubj' in x][0][2] - 1][1] == 'NN': IAC.append(active_token) index.append([x for x in dependency if 'nsubj' in x][0][2]) # Rule 3 & Rule 5 for j in cop: # Rule 3 conj = [] # if token[j[1]-1] in all_term: IAC.append(token[j[1] - 1]) index.append(j[1]) conj.append(j[1]) if [x for x in dependency if 'conj' in x and j[1] in x] != []: for i in [x for x in dependency if 'conj' in x and j[1] in x]: conj.append(i[1]) conj.append(i[2]) 
# Rule 5 conj = list(set(conj)) for i in conj: t1 = i connect = [x for x in dependency if t1 in x] for k in connect: if k[1] != t1: if pos[k[1] - 1][1] == 'VB' or pos[k[1] - 1][1] == 'VV': IAC.append(token[k[1] - 1]) index.append(k[1]) if token[t1 - 1] not in IAC: IAC.append(token[t1 - 1]) index.append(t1) if k[2] != t1: if pos[k[2] - 1][1] == 'VB' or pos[k[2] - 1][1] == 'VV': IAC.append(token[k[2] - 1]) index.append(k[2]) if token[t1 - 1] not in IAC: IAC.append(token[t1 - 1]) index.append(t1) return IAC, index def Non_Subject_Noun_Rule(self, parse, dependency, token, pos): result = [] index = [] if [x for x in dependency if 'xcomp' in x] != []: A, B = self.Rule6(dependency, token) result += A index += B if [x for x in dependency if 'case' in x] != []: A, B = self.Rule7(dependency, pos, token) result += A index += B if [x for x in dependency if 'dobj' in x] != []: A, B = self.Rule8(dependency, token) result += A index += B result += self.Conj(index, dependency, token) return list(set(result)) # 3.3.4 Rule 1 def Rule6(self, dependency, token): IAC = [] index = [] xcomp = [x for x in dependency if 'xcomp' in x] # open clausal complement for j in xcomp: # if token[j[1]-1] in all_term: IAC.append(token[j[1] - 1]) index.append(j[1]) return IAC, index # 3.3.4 Rule 2 def Rule7(self, dependency, pos, token): IAC = [] index = [] case = [x for x in dependency if 'case' in x] # a prepositional relation for j in case: if pos[j[1] - 1][1] == 'NN': connect = [ x for x in dependency if j[1] in x and 'mod' in x[0] ] for i in connect: IAC.append(token[i[1] - 1]) IAC.append(token[i[2] - 1]) index.append(i[1]) index.append(i[2]) return list(set(IAC)), list(set(index)) # 3.3.4 Rule 3 def Rule8(self, dependency, token): IAC = [] index = [] dobj = [x for x in dependency if 'dobj' in x] # a direct object relation for j in dobj: IAC.append(token[j[2] - 1]) index.append(j[2]) return IAC, index def Save(self, output_path): with open(output_path, 'wb') as f: pickle.dump(self.IAC, f)
def get_clues(text): text = text print("*--------(%s)-------------*" % (text)) print(type(text)) nlp = StanfordCoreNLP('http://localhost:9001') stop_words = set(stopwords.words('english')) ''' Method to remove numbers appended at last ''' dep_parse = nlp.annotate(text, properties={ 'annotators': 'depparse', 'outputFormat': 'json', 'timeout': 10000, }) pos = nlp.annotate(text, properties={ 'annotators': 'lemma', 'outputFormat': 'json', 'timeout': 10000, }) sn = SenticNet() word_to_dep = [{} for i in range(len(dep_parse['sentences']))] word_to_par = [{} for i in range(len(dep_parse['sentences']))] word_to_pos = [{} for i in range(len(dep_parse['sentences']))] word_to_lemma = [{} for i in range(len(dep_parse['sentences']))] word_to_child = [{} for i in range(len(dep_parse['sentences']))] sents = [[] for i in range(len(dep_parse['sentences']))] index_to_word = {} aspect_result = [[] for i in range(len(dep_parse['sentences']))] ''' Constructing dicts for maintaining the dependencies among words. ''' ''' Appending each word by occurence number to maintain distinct word count ''' print("********") for i, sent in enumerate(dep_parse['sentences']): for dep in sent['basicDependencies']: word_to_dep[i][dep['dependentGloss'] + str(dep['dependent'])] = dep['dep'] word_to_par[i][dep['dependentGloss'] + str(dep['dependent'])] = dep['governorGloss'] + str( dep['governor']) index_to_word[dep['dependentGloss'] + str(dep['dependent'])] = dep['dependentGloss'] if (dep['governorGloss'] + str(dep['governor']) not in word_to_child[i]): word_to_child[i][dep['governorGloss'] + str(dep['governor'])] = [] if (dep['dependentGloss'] + str(dep['dependent']) not in word_to_child[i]): word_to_child[i][dep['dependentGloss'] + str(dep['dependent'])] = [] word_to_child[i][dep['governorGloss'] + str(dep['governor'])].append( dep['dependentGloss'] + str(dep['dependent'])) sents[i].append(dep['dependentGloss'] + str(dep['dependent'])) word_to_dep[i]['ROOT0'] = 'root' word_to_par[i]['ROOT0'] = 'root' for i, sent in enumerate(pos['sentences']): for pos_tagger in sent['tokens']: word_to_pos[i][pos_tagger['word']] = pos_tagger['pos'] word_to_lemma[i][pos_tagger['word']] = pos_tagger['lemma'] word_to_pos[i]['ROOT'] = 'root' word_to_lemma[i]['ROOT'] = 'root' ''' Displaying the deps ''' ##Implemeting rules to extract aspects for i, sent in enumerate(sents): if (__name__ == '__main__'): print(word_to_dep[i], word_to_par[i], word_to_pos[i], word_to_lemma[i]) print("Children==>") print(word_to_child[i]) for i, sent in enumerate(sents): token_t = word_to_child[i]['ROOT0'][0] is_sub = False token_h = None for child in word_to_child[i][token_t]: if 'subj' in word_to_dep[i][child]: is_sub = True token_h = child #If subject noun relationship present if is_sub: """ Rule 0: if any adv or adj modifies the token t. """ for child in word_to_child[i][token_t]: if ('amod' in word_to_dep[i][child] or 'advmod' in word_to_dep[i][child]): try: concept_info = sn.concept(index_to_word[child]) aspect_result[i].append(token_t) if __name__ == '__main__': print("Rule 1 triggered.") print("present in senticnet") except KeyError: print("OOps") """ Rule 1: The battery lasts little. 
""" for child in word_to_child[i][token_t]: if (word_to_dep[i][child] == 'advmod' or word_to_dep[i][child] == 'amod' or word_to_dep[i][child] == 'advcl') and ( 'VB' in word_to_pos[i][index_to_word[token_t]]): aspect_result[i].append(token_t) aspect_result[i].append(token_h) if __name__ == '__main__': print("Rule 1 triggered.") print(token_t) print(token_h) """ Rule 2: I like the beauty of the screen (and I like the lens of this camera). """ for child in word_to_child[i][token_t]: if (word_to_dep[i][child] == 'dobj' and 'NN' in word_to_pos[i][index_to_word[child]]): aspect_result[i].append(child) if __name__ == '__main__': print(child) try: concept_info = sn.concept(index_to_word[child]) if __name__ == '__main__': print("Rule 2 triggered") for grandchild in word_to_child[i][child]: if ('NN' in word_to_pos[i][ index_to_word[grandchild]]): aspect_result[i].append(grandchild) print(grandchild) except KeyError: print("OOps") """ Rule 3: I would like to comment on the camera of this phone. """ for child in word_to_child[i][token_t]: if (word_to_dep[i][child] == 'xcomp'): try: sn.concept(index_to_word[child]) aspect_result[i].append(child) if __name__ == '__main__': print(child) except KeyError: print("OOps") for grandchild in word_to_child[i][child]: if ('NN' in word_to_pos[i][index_to_word[grandchild]]): aspect_result[i].append(grandchild) if __name__ == '__main__': print(grandchild) print("Rule 3 triggered.") """ Rule 4: The car is expensive. """ for child in word_to_child[i][token_t]: if (word_to_dep[i][child] == 'cop'): try: sn.concept(word_to_lemma[i][index_to_word[token_t]]) aspect_result[i].append(token_t) if __name__ == '__main__': print("Rule 4 triggered") print(token_t) except KeyError: pass """ Rule 5: The camera is nice """ for child in word_to_child[i][token_t]: if (word_to_dep[i][child] == 'cop' and 'NN' in word_to_pos[i][index_to_word[token_h]]): aspect_result[i].append(token_h) if __name__ == '__main__': print("Rule 5 triggered.") print(token_h) """ Rule 6: """ for child in word_to_child[i][token_t]: if (word_to_dep[i][child] == 'cop'): for child2 in word_to_child[i][token_t]: if (child != child2 and 'VB' in word_to_pos[i][index_to_word[child2]]): try: sn.concept(index_to_word[token_t]) sn.concept(index_to_word[child2]) aspect_result[i].append(token_t) aspect_result[i].append(child2) if __name__ == '__main__': print("rule 6 trigg") print(token_t) print(child2) except KeyError: pass else: """ Rule 7:Very big to hold. """ for word in sent: if ('RB' in word_to_pos[i][index_to_word[word]] or 'JJ' in word_to_pos[i][index_to_word[word]]): for child in word_to_child[i][word]: if (word_to_dep[i][child] == 'xcomp' or word_to_dep[i][child] == 'ccomp'): aspect_result[i].append(word) if __name__ == '__main__': print("Rule 7 triggered") print(word) """ Rule 8: Love the sleekness of the player. """ for word in sent: for child in word_to_child[i][word]: if ('NN' in word_to_pos[i][index_to_word[child]] and word_to_dep[i][child] == 'nmod'): for grandchild in word_to_child[i][child]: if ('IN' in word_to_pos[i][ index_to_word[grandchild]]): aspect_result[i].append(word) aspect_result[i].append(child) if __name__ == '__main__': print(word) print(child) print("Rule 8 triggered.") """ Rule 9: Not to mention the price of the phone. 
""" for word in sent: for child in word_to_child[i][word]: if (word_to_dep[i][child] == 'dobj'): aspect_result[i].append(child) if __name__ == '__main__': print(child) print("Rule 9 triggered") ''' Rule 11 : Checking for conjuctions ''' for asp in aspect_result[i]: for word in sent: if (word_to_dep[i][word] == 'conj' and word_to_par[i][word] == asp): aspect_result[i].append(word) if (__name__ == '__main__'): print("Rule conj triggered.") print(word) finalIAC = [set(aspect_result[i]) for i in range(len(sents))] finalIAC = [[index_to_word[w] for w in finalIAC[i]] for i in range(len(sents))] print(finalIAC) singleFinalIAC = [] for i in range(len(sents)): for w in finalIAC[i]: if w not in stop_words: singleFinalIAC.append(w) print(singleFinalIAC) finalSenti = [] for iac in singleFinalIAC: try: concept_info = sn.concept((iac)) finalSenti.append(iac) except KeyError: print("No word available for " + iac) return singleFinalIAC, finalSenti
# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""
import nltk
nltk.download()

import os

from senticnet.senticnet import SenticNet

sn = SenticNet()
sn.concept('')


def fun1(d):
    try:
        from senticnet.senticnet import SenticNet
        sn = SenticNet()
        sn.semantics(d)
        return True
    except KeyError as error:
        return False


fun1('day')

from nltk.corpus import wordnet

sk = wordnet.synset('ssd')
from senticnet.senticnet import SenticNet

sn = SenticNet('ru')
word = input('Enter your comment (for example "как дела"): ')
lst = word.split()

# concept_info = sn.concept(word)
# polarity_value = sn.polarity_value(word)
# polarity_intense = sn.polarity_intense(word)
# moodtags = sn.moodtags(word)
# semantics = sn.semantics(word)

print(list(map(lambda x: sn.sentics(x), lst)))
pop = input(" ")
import bagofconcepts as boc

# Each line of the corpus must correspond to one document of the corpus
# boc_model = boc.BOCModel(doc_path="input corpus path")
boc_model = boc.BOCModel('text.txt')
# boc_model.context = text

# The output can be saved with the save_path parameter
boc_matrix, word2concept_list, idx2word_converter = boc_model.fit()

# SenticNet lexicon lookup
from senticnet.senticnet import SenticNet

sn = SenticNet()
text = "love"  # placeholder concept for the lookups below; replace with a real concept or text
concept_info = sn.concept(text)
polarity_value = sn.polarity_value(text)
polarity_intense = sn.polarity_intense(text)
moodtags = sn.moodtags(text)
semantics = sn.semantics(text)
sentics = sn.sentics(text)

print('==================================')
print('text: ', text)
print('concept_info: ', concept_info)
print('polarity_value: ', polarity_value)
print('polarity_intense: ', polarity_intense)
print('moodtags: ', moodtags)
print('semantics: ', semantics)
def data_Preprocessing(data, data_test, n_of_words, polarity_threshold):
    Reviews = data["Content"]

    # Strip non-ASCII characters
    # (string.printable offers another encode/decode option if needed)
    for i in range(0, len(Reviews)):
        Reviews.iloc[i] = Reviews.iloc[i].encode('ascii', errors='ignore').decode()

    # Set all the content to lower case
    Reviews = Reviews.apply(lambda row: row.lower())

    # Add to the following variable the characters that you want to delete
    chars_to_del = "[" + string.punctuation + string.digits + "]"
    # Delete all the chars in "chars_to_del" from each row of the dataframe
    Reviews = Reviews.apply(lambda row: re.sub(chars_to_del, '', row))

    # Tokenize every single word of the data content
    Token_Reviews = Reviews.apply(lambda row: nltk.word_tokenize(row))

    # Generate the list "stop" of elements TO BE REMOVED from the sentences (stopwords, numbers and punctuation)
    stop = stopwords.words("english")
    # Remove all the words in the variable "stop"
    Filtered_Review = Token_Reviews.apply(
        lambda row: [w for w in row if not w in stop])

    # Stem the data's content; some stemmed words:
    # http://snowball.tartarus.org/algorithms/english/stemmer.html
    ps = PorterStemmer()
    for idx in range(0, len(Filtered_Review)):
        Stemmed_Review_temp = []
        for word in Filtered_Review.iloc[idx]:
            Stemmed_Review_temp.append(ps.stem(word))
        Filtered_Review.iloc[idx] = Stemmed_Review_temp

    # Terms choosing: most common words
    sn = SenticNet()
    Filtered_Review_List = list(itertools.chain.from_iterable(Filtered_Review))
    Words_Frequency = FreqDist(Filtered_Review_List)
    Most_Common_Words_Frequency = Words_Frequency.most_common(n_of_words)
    Most_Common_Words = []
    for i in range(0, n_of_words):
        Most_Common_Words.append(Most_Common_Words_Frequency[i][0])

    index = 1
    words_and_polarity = pd.DataFrame(columns=["Word", "Polarity"])
    Selected_Words = []
    # Terms polarity: keep the common words whose SenticNet intensity exceeds the threshold
    for word in Most_Common_Words:
        try:
            temp = sn.polarity_intense(word)
            if (float(temp) > polarity_threshold
                    or float(temp) < -polarity_threshold):
                words_and_polarity.loc[index] = [word, float(temp)]
                index = index + 1
                Selected_Words.append(word)
        except Exception:
            continue

    # Uncomment to recompute the selected words and their polarity
    # words_and_polarity.to_csv("Words_and_Polarity.csv", sep=",")

    return data, data_test
# for w in Dict.keys():
#     if w not in porvalis and w not in slangwords and w not in kaggeleSentiment:
#         print(w)
#         c = c + 1
# print(c)


def Precision(tp, fp):
    return tp / (tp + fp)


def Recall(tp, fn):
    return tp / (tp + fn)


sn = SenticNet()
zeroSen = 0
tp = 0
tn = 0
fp = 0
fn = 0
actT = 0

with open("Dataset.pickle", "rb") as handle:
    pyDS = pickle.load(handle)

for doc in pyDS.DocList:
    totSen = 0
    for w in doc.TermList:
        try:
            sen = sn.polarity_intense(w)
        except KeyError:
            sen = 0
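# Hedged worked example of the two helpers above, with made-up counts:
# 40 true positives, 10 false positives and 20 false negatives give
# Precision = 40 / (40 + 10) = 0.8 and Recall = 40 / (40 + 20) ≈ 0.667.
if __name__ == "__main__":
    print(Precision(tp=40, fp=10))  # 0.8
    print(Recall(tp=40, fn=20))     # 0.666...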
import stanza as st
import csv
import textstat
import numpy as np
import matplotlib.pyplot as plt
import random
import copy
from gensim.models import KeyedVectors
from senticnet.senticnet import SenticNet

sn = SenticNet()

import language_tool_python

tool = language_tool_python.LanguageTool('en-US')

from transformers import pipeline
from wordhoard import antonyms, synonyms
from nltk import tokenize
from nltk.corpus import wordnet
from nltk.tokenize.treebank import TreebankWordDetokenizer

nlp = pipeline("fill-mask", model="bert-large-uncased", tokenizer="bert-large-uncased")

surname = np.load('surname.npy', allow_pickle=True)
malenames = np.load('male.npy', allow_pickle=True)
femalenames = np.load('female.npy', allow_pickle=True)
unsex = np.load('unsex.npy', allow_pickle=True)
boy = np.load('boy.npy', allow_pickle=True)
girl = np.load('girl.npy', allow_pickle=True)

gender_word = [['boy', 'girl'], ['boys', 'girls'], ['nephew', 'niece'],
               ['brother', 'sister'], ['brothers', 'sisters'],
               ['boyfriend', 'girlfriend'], ['dad', 'mom'],
               ['father', 'mother'], ['grandfather', 'grandmother'],
               ['grandpa', 'grandma'], ['grandson', 'granddaughter'],
               ['male', 'female'], ['groom', 'bride'], ['husband', 'wife'],
               ['king', 'queen'], ['man', 'woman'], ['men', 'women'],
               ['policeman', 'policewoman'], ['prince', 'princess'],
from senticnet.senticnet import SenticNet

sn = SenticNet()

print("polarity value:", sn.polarity_value("love"))
print("polarity intense:", sn.polarity_intense("love"))
print("moodtags:", ", ".join(sn.moodtags("love")))
print("semantics:", ", ".join(sn.semantics("love")))
print("\n".join([key + ": " + str(value) for key, value in sn.sentics("love").items()]))
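# Hedged sketch of a guarded lookup: SenticNet raises KeyError for concepts that are
# not in its vocabulary, so unknown words can simply be skipped. The word list is
# illustrative, mirroring the try/except pattern used in the snippets above.
from senticnet.senticnet import SenticNet

sn = SenticNet()
for word in ["love", "qwertyuiop"]:
    try:
        print(word, sn.polarity_intense(word), sn.moodtags(word))
    except KeyError:
        print(word, "not in SenticNet")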
# Text classification on the 5 SenticNet scales, evaluated with XGBoost
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from senticnet.senticnet import SenticNet
from nltk.stem import WordNetLemmatizer
from sklearn import metrics
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

df = pd.read_csv('preprocessed.csv')
lemmatizer = WordNetLemmatizer()  # reduces words to their base form
sn = SenticNet()  # knowledge base classifying words and expressions by meaning and mood


# Function returning the scores of the input text on the 5 SenticNet scales:
# polarity intensity, pleasantness, attention, sensitivity and aptitude.
# Each score is the sum of the scores of all words and expressions from the text
# that appear in the base, normalized by the number of words in the text.
def SN(data):
    # Convert the text into a count vector and build a vocabulary of words and
    # phrases up to 3 words long
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 3))
    vec = vectorizer.fit_transform([data]).todense()
    k = 0
    polarity_intense = sentics_pleasant = sentics_attention = sentics_sense = sentics_aptitude = 0
    for i in vectorizer.vocabulary_.keys():
        try:
            # Try to find the i-th word/phrase in the base
            num_repetitions = vec[0, vectorizer.vocabulary_[i]]
            polarity_intense += (float(sn.polarity_intense(i)) * num_repetitions)
            sentics_pleasant += (float(sn.sentics(i)['pleasantness']) * num_repetitions)
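# Hedged mini-example of the lookup pattern used in SN() above: CountVectorizer exposes
# the fitted n-gram vocabulary via vocabulary_ (term -> column index), and
# vec[0, vocabulary_[term]] is the number of times that term occurs in the document.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(analyzer='word', ngram_range=(1, 3))
counts = cv.fit_transform(["good day good mood"]).todense()
for term, col in cv.vocabulary_.items():
    print(term, counts[0, col])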
def __init__(self):
    self.col = ['Name', 'Brand', 'Price', 'Title', 'Score', 'Time', 'Text']
    self.sn = SenticNet('en')
    self.wordnet_lemmatizer = WordNetLemmatizer()
class Sarcasm: def __init__(self, *args, **kwargs): # loading necessaries self.nlp = spacy.load("en_core_web_sm") self.senti = SenticNet() self.sid = SentimentIntensityAnalyzer() #loading dataset self.df = pd.read_json("./Sarcasm_Headlines_Dataset.json", lines=True) self.df = self.df[:15000] self.df.drop(columns="article_link", inplace=True) #dropping unnessary attribute #storing nlp data in headlines variable self.headlines = [] self.uni_gram = set() self.uni_feature = [] self.y_ = [] for i in self.df['headline']: self.headlines.append(self.nlp(i)) def w_score(self, w): """ input: word Finding word score based on nltk's vader_lexicon sentiment analysis and Senticnet sentiment analysis """ ss = self.sid.polarity_scores(w)['compound'] try: sn = self.senti.polarity_intense(w) sn = float(sn) if ss == 0: return sn else: return (sn + ss) / 2 except: #not found in sn find for only ss or concepts if ss != 0: return ss elif ss == 0: #find for the concepts return ss def sentimentScore(self, sent): """ input: sentence Return if contradiction occurs or not """ sum_pos_score = 0 sum_neg_score = 0 for w in sent: if w.lemma_ == '-PRON-': score = self.w_score(w.text) else: score = self.w_score(w.lemma_) if score > 0: sum_pos_score += score else: sum_neg_score += score if sum_pos_score > 0 and sum_neg_score < 0: return ("contradict", sum_pos_score, sum_neg_score) else: return ("anything", sum_pos_score, sum_neg_score) def coherence(self, s1, s2): ''' Input sentence1, sentence2 using nlp Rule1:- Pronoun match feature - including reflexive, personal, and possessive pronouns. Rule2:- String match feature - ignore stop words Rule3:- Definite noun phrase - w2 starts with the word 'the' Rule4:- Demonstrative noun phrase feature - w2 starts with the "this", "that", "these" and "those" Rule5:- Both proper names features - w1 and w2 are both named entities ''' # subject and object of s1 and s2 sub1 = "" sub2 = "" obj1 = "" obj2 = "" for i in s1.noun_chunks: if i.root.dep_ == 'nsubj': sub1 = i.root if i.root.dep == 'pobj': obj1 = i.root for j in s2.noun_chunks: if j.root.dep_ == 'nsubj': if type(sub1) != type( "") and sub1.pos_ == 'PRON' and j.root.pos_ == 'PRON': if sub1.text.lower() == j.root.text.lower(): return "coherent" # rule 4:- if j[0].text.lower() == 'the': return "coherent" if j[0].text.lower() in ['this', 'that', 'these', 'those']: return "coherent" if j.root.dep_ == 'pobj': if type(obj1) != type( "") and obj1.pos_ == 'PRON' and j.root.pos_ == 'PRON': if obj1.text.lower() == j.root.text.lower(): return "coherent" return "Not coherent" def to_string_from_list(self, l): st = "" for i in l: st += i + ' ' return st.rstrip() def n_gram_feature(self, text, n): """ Input: headline in nlp Finding n grams of given text """ one_list = [] for tok in text: if not tok.is_punct: if tok.lemma_ != '-PRON-': one_list.append(tok.lemma_) else: one_list.append(tok.text) try: one_list.remove(' ') except: pass #convert it to n-gram _list = [] for i, t in enumerate(one_list): if len(one_list[i:n + i]) >= n: _list.append(self.to_string_from_list(one_list[i:n + i])) return set(_list) def contradiction_feature(self, headline): ''' Contradiction feature input: nlp processed ''' #for single sentence headline if len(list(headline.sents)) == 1: if self.sentimentScore(headline)[0] == 'contradict': return (1, 0) else: return (0, 0) #for multisentence headline else: if self.sentimentScore(headline)[0] == 'contradict': sent = list(headline.sents) i = 0 while i < len(sent) - 1: # number of sentece if self.coherence(sent[i], sent[i + 1]) 
is not "coherent": return (0, 0) i += 1 return (0, 1) else: return (0, 0) def baseline3(self): ''' Use of sentiment analysis + coherence ''' predictions = [] for i in self.headlines: get = self.contradiction_feature(i) if get == (1, 0) or get == (0, 1): predictions.append(1) else: predictions.append(0) return (confusion_matrix(self.df['is_sarcastic'], predictions), classification_report(self.df['is_sarcastic'], predictions), accuracy_score(self.df['is_sarcastic'], predictions)) def baseline1(self): predictions = [] for p in self.headlines: co, _, _ = self.sentimentScore(p) if (co == 'contradict'): predictions.append(1) else: predictions.append(0) return (confusion_matrix(self.df['is_sarcastic'], predictions), classification_report(self.df['is_sarcastic'], predictions), accuracy_score(self.df['is_sarcastic'], predictions)) def uni_gram_features(self, start, end, n=1): self.uni_gram = list(self.uni_gram) self.uni_gram = sorted(self.uni_gram) index = start for p in self.headlines[start:end]: uni = [0 for i in range(len(self.uni_gram))] for i, j in enumerate(p): temp = [] #temp if len(p[i:n + i]) >= n: for k in range(n): if p[i + k].lemma_ != '-PRON-': temp.append(p[i + k].lemma_) else: temp.append(p[i + k].text) temp = self.to_string_from_list(temp) if temp in self.uni_gram: uni[self.uni_gram.index(temp)] = 1 self.y_.append(self.df['is_sarcastic'][index]) index += 1 self.uni_feature.append(uni) def baseline2(self, n=1): #unigram features self.uni_gram = set() self.uni_feature = [] self.y_ = [] for p in self.headlines: self.uni_gram = self.uni_gram.union(self.n_gram_feature(p, n)) #now find length = len(self.headlines) t1 = threading.Thread(target=self.uni_gram_features, name='t1', args=(0, int(length / 4), n)) t2 = threading.Thread(target=self.uni_gram_features, name='t2', args=(int(length / 4), int(length / 2), n)) t3 = threading.Thread(target=self.uni_gram_features, name='t3', args=(int(length / 2), int(3 * length / 4), n)) t4 = threading.Thread(target=self.uni_gram_features, name='t4', args=(int(3 * length / 4), length, n)) t1.daemon = True t2.daemon = True t3.daemon = True t4.daemon = True st = time.time() t1.start() t2.start() t3.start() t4.start() t1.join() t2.join() t3.join() t4.join() print(f'time taken: {time.time()-st}') X_train, X_test, y_train, y_test = train_test_split(self.uni_feature, self.y_, test_size=0.33, random_state=42) return self.findLINEARSVCResult(X_train, X_test, y_train, y_test) def findLINEARSVCResult(self, X_train, X_test, y_train, y_test): ''' Training data using LinearSVC model ''' svc_model = LinearSVC() svc_model.fit(X_train, y_train) predictions = svc_model.predict(X_test) return (confusion_matrix(y_test, predictions), classification_report(y_test, predictions), accuracy_score(y_test, predictions))