def getTestVector(self, sentence, opinionCategories, dom):
    test_pos = []
    temp_vector = []
    test_vector = []
    #characters to strip before the unigram lexicon lookups
    chars_to_remove = ['=', '!', '?', ',', '<', '.', '>', '/', ';', ':', ']', '}', '[', '{',
                       '|', '@', '$', '%', '^', '&', '*', '(', ')', '_', '-', '+', '"',
                       '1', '2', '3', '4', '5', '6', '7', '8', '9']
    t2 = word_tokenize(sentence) #tokenize without lowercasing, to check for caps
    capsList, capsCounter = self.listCaps(t2) #store the all-caps words of the text
    text = word_tokenize(sentence.lower()) #tokenize, converted to lower case
    for opinionCategory in opinionCategories:
        test_pos.append(sentence)
        #calculate a score for each lexicon
        temp0 = self.checkLexiconReady(text, self.stopwords, capsList, self.negation, self.AFINN_lexicon) #AFINN lexicon scores
        temp3 = self.checkLexiconReady(text, self.stopwords, capsList, self.negation, self.huLiu_lexicon) #Hu and Liu lexicon scores
        temp4 = self.checkLexiconReady(text, self.stopwords, capsList, self.negation, self.nrc_lexicon) #NRC lexicon scores
        temp1 = self.checkLexiconUni(text, chars_to_remove, self.stopwords, capsList, self.negation, self.train_unigram_lexicon) #unigram lexicon scores from the training data of each domain
        temp7 = self.howManyUpperFirst(t2) #number of words starting with a capital letter
        temp9 = [self.howMany(text, '?'), self.howMany(text, '!')] #numbers of question and exclamation marks
        temp11 = self.lastSymbol(t2) #whether the last symbol is a question or an exclamation mark
        cat = opinionCategory.split('#') #one-hot features for the entity and the attribute
        cat0 = [1 if ent == cat[0] else 0 for ent in self.entities]
        cat1 = [1 if attr == cat[1] else 0 for attr in self.attributes]
        temp12 = [len(opinionCategories)] + cat0 + cat1
        temp = temp0 + temp1 + temp3 + temp4 + temp7 + temp9 + temp11 + [capsCounter] + temp12
        temp_vector.append(temp) #build the feature vector
    temp_vector = self.normalize(temp_vector) #normalize the vectors
    pos = arktagger.pos_tag_list(test_pos) #compute the POS tags
    test_pos = self.howManyPos(pos)
    test_pos_bi = self.calcScorePosBi(pos)
    test_pos_bi = self.normalize(test_pos_bi)
    for i in range(len(temp_vector)): #join the matrices
        test_vector.append(temp_vector[i] + test_pos[i] + test_pos_bi[i])
    return test_vector
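# Illustrative sketch (added for clarity, not part of the original file): how
# the one-hot entity/attribute encoding above behaves. The inventories below
# are made-up stand-ins for self.entities and self.attributes.
#
#     entities   = ['RESTAURANT', 'FOOD', 'SERVICE']
#     attributes = ['GENERAL', 'QUALITY', 'PRICES']
#     cat = 'FOOD#QUALITY'.split('#')                       # -> ['FOOD', 'QUALITY']
#     cat0 = [1 if e == cat[0] else 0 for e in entities]    # -> [0, 1, 0]
#     cat1 = [1 if a == cat[1] else 0 for a in attributes]  # -> [0, 1, 0]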
def train(self, dom):
    temp_vector = []
    train_tags = []
    train_pos = []
    train_vector = []
    #characters to strip before the unigram lexicon lookups
    chars_to_remove = ['=', '!', '?', ',', '<', '.', '>', '/', ';', ':', ']', '}', '[', '{',
                       '|', '@', '$', '%', '^', '&', '*', '(', ')', '_', '-', '+', '"',
                       '1', '2', '3', '4', '5', '6', '7', '8', '9']
    reviews = ET.parse(self.train_path).getroot().findall('Review')
    for review in reviews:
        sentences = review[0] #get the sentences
        for sentence in sentences:
            if len(sentence) > 1:
                opinions = sentence[1] #get the opinions field
                if len(opinions) > 0: #check that there are aspects
                    t = sentence[0].text
                    t2 = word_tokenize(t) #tokenize without lowercasing, to check for caps
                    capsList, capsCounter = self.listCaps(t2) #store the all-caps words of the text
                    text = word_tokenize(t.lower()) #tokenize, converted to lower case
                    for opinion in opinions:
                        polarity = opinion.attrib['polarity']
                        train_tags.append(polarity) #store the polarity label (the class tag)
                        train_pos.append(t) #store the text for the POS tagging
                        #calculate a score for each lexicon
                        temp0 = self.checkLexiconReady(text, self.stopwords, capsList, self.negation, self.AFINN_lexicon) #AFINN lexicon scores
                        temp3 = self.checkLexiconReady(text, self.stopwords, capsList, self.negation, self.huLiu_lexicon) #Hu and Liu lexicon scores
                        temp4 = self.checkLexiconReady(text, self.stopwords, capsList, self.negation, self.nrc_lexicon) #NRC lexicon scores
                        temp1 = self.checkLexiconUni(text, chars_to_remove, self.stopwords, capsList, self.negation, self.train_unigram_lexicon) #unigram lexicon scores from the training data of each domain
                        temp7 = self.howManyUpperFirst(t2) #number of words starting with a capital letter
                        temp9 = [self.howMany(text, '?'), self.howMany(text, '!')] #numbers of question and exclamation marks
                        temp11 = self.lastSymbol(t2) #whether the last symbol is a question or an exclamation mark
                        cat = opinion.attrib['category'].split('#') #one-hot features for the entity and the attribute
                        cat0 = [1 if ent == cat[0] else 0 for ent in self.entities]
                        cat1 = [1 if attr == cat[1] else 0 for attr in self.attributes]
                        temp12 = [len(opinions)] + cat0 + cat1
                        temp = temp0 + temp1 + temp3 + temp4 + temp7 + temp9 + temp11 + [capsCounter] + temp12
                        temp_vector.append(temp) #build the feature vector
    temp_vector = self.normalize(temp_vector) #normalize the vectors
    pos = arktagger.pos_tag_list(train_pos) #compute the POS tags
    train_pos = self.howManyPos(pos) #count the POS tags
    train_pos_bi = self.calcScorePosBi(pos) #calculate the POS tag bigram scores for each text
    train_pos_bi = self.normalize(train_pos_bi)
    for i in range(len(temp_vector)): #join the matrices
        train_vector.append(temp_vector[i] + train_pos[i] + train_pos_bi[i])
    print
    print '---- End of train ----'
    return train_vector, train_tags
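# Feature layout note (added for clarity, not in the original source): each row
# of the returned train_vector concatenates, in order, the AFINN, unigram,
# Hu & Liu and NRC lexicon scores, the capitalization and punctuation counts,
# the last-symbol flag, the caps counter, [number of opinions] plus the entity
# and attribute one-hot vectors, and finally the POS tag counts and POS bigram
# scores joined in at the end.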
    plt.title(t)
    plt.legend(loc="upper right")
    plt.xlabel("sentiment score")
    plt.show()

#read labels and messages from the dataset
dataset = "datasets/train15.tsv"
#dataset = "datasets/training-set-sample.tsv"
labels, messages = tsvreader.opentsv(dataset)

#pos tags of messages
tags = arktagger.pos_tag_list(messages)

#initialize lists that hold the sentiment score of every message for every lexicon
socal_scores = []
minqinghu_scores = []
afinn_scores = []
nrc1_scores = []
nrc2_scores = []
nrc3_scores = []
nrc4_scores = []
nrc5_scores = []
mpqa_scores = []
swn_scores = []

#Lexicon objects
def main(messages_test):
    #tokenize all messages
    tokens_test = tokenize(messages_test)
    #compute pos tags for all messages
    pos_tags_test = arktagger.pos_tag_list(messages_test)
    #compute pos tag bigrams
    pos_bigrams_test = getBigrams(pos_tags_test)
    #compute pos tag trigrams
    pos_trigrams_test = getTrigrams(pos_tags_test)

    now = time.time()

    #load scores
    (pos_tags_scores_neutral, pos_tags_scores_positive, pos_tags_scores_negative,
     pos_bigrams_scores_neutral, pos_bigrams_scores_positive, pos_bigrams_scores_negative,
     pos_trigrams_scores_neutral, pos_trigrams_scores_positive, pos_trigrams_scores_negative,
     mpqaScores) = loadScores()
    #load lexicons
    negationList, slangDictionary, lexicons, mpqa_lexicons = loadLexiconsFromFile()
    #load clusters
    clusters = loadClustersFromFile()
    print "Resources loaded"

    #load Glove embeddings
    d = 25
    glove = loadGlove(d)

    #Subjectivity Detection Features
    #SD1 features
    features_test_1 = features.getFeatures(
        messages_test, tokens_test, pos_tags_test, slangDictionary, lexicons,
        mpqa_lexicons, pos_bigrams_test, pos_trigrams_test,
        pos_bigrams_scores_negative, pos_bigrams_scores_positive,
        pos_trigrams_scores_negative, pos_trigrams_scores_positive,
        pos_tags_scores_negative, pos_tags_scores_positive, mpqaScores,
        negationList, clusters, pos_bigrams_scores_neutral,
        pos_trigrams_scores_neutral, pos_tags_scores_neutral)
    #SD2 features: centroid of the Glove vectors of each message's tokens
    features_test_2 = []
    for i in range(0, len(messages_test)):
        features_test_2.append(glove.findCentroid(tokens_test[i]))
    features_test_2 = np.array(features_test_2)

    #regularize features
    print "After Reg"
    features_test_1 = regularization.regularize(features_test_1)
    print features_test_1
    features_test_2 = regularization.regularizeHorizontally(features_test_2)
    print features_test_2

    #load SD classifiers
    with open('resources/sd_models.pkl', 'rb') as input:
        sd1 = pickle.load(input)
        sd2 = pickle.load(input)

    #get confidence scores
    test_confidence_1 = sd1.decision_function(features_test_1)
    test_confidence_2 = sd2.decision_function(features_test_2)

    #squash confidence scores into (0, 1); despite its name, this is the
    #logistic (sigmoid) function
    softmax = lambda x: 1 / (1. + math.exp(-x))
    test_confidence_1 = [softmax(conf) for conf in test_confidence_1]
    test_confidence_2 = [softmax(conf) for conf in test_confidence_2]
    test_confidence_1 = np.array(test_confidence_1)
    test_confidence_2 = np.array(test_confidence_2)

    #Sentiment Polarity Features (append confidence scores to SD features)
    #SP1 features
    features_test_1 = np.hstack((features_test_1, test_confidence_1.reshape(test_confidence_1.shape[0], 1)))
    #SP2 features
    features_test_2 = np.hstack((features_test_2, test_confidence_2.reshape(test_confidence_2.shape[0], 1)))

    #load SP classifiers
    with open('resources/sp_models.pkl', 'rb') as input:
        sp1 = pickle.load(input)
        sp2 = pickle.load(input)

    #get confidence scores of every system
    confidence1 = sp1.decision_function(features_test_1)
    confidence2 = sp2.decision_function(features_test_2)
    for i in range(0, confidence1.shape[0]):
        for j in range(0, confidence1.shape[1]):
            confidence1[i][j] = softmax(confidence1[i][j])
    for i in range(0, confidence2.shape[0]):
        for j in range(0, confidence2.shape[1]):
            confidence2[i][j] = softmax(confidence2[i][j])

    #ensemble confidence scores with weight W
    W = 0.66
    confidence = confidence1 * W + confidence2 * (1 - W)
    print "confidence"
    print confidence

    #final prediction: argmax over the class columns, shifted to {-1, 0, 1}
    prediction = np.array([np.argmax(x) - 1 for x in confidence])

    print "Prediction\n"
    for i in range(0, prediction.shape[0]):
        if prediction[i] == -1:
            pol = "Negative"
        elif prediction[i] == 0:
            pol = "Neutral"
        elif prediction[i] == 1:
            pol = "Positive"
        print "Message : " + messages_test[i] + " Polarity : " + pol + "\n"

    #accuracy and indices of the wrongly classified messages
    #(`senti`, the gold labels, is assumed to be defined at module level)
    count_t = 0
    num_f = []
    num_f1 = []
    num_f2 = []
    num_f3 = []
    num_f4 = []
    num_f5 = []
    num_f6 = []
    senti_t = []
    prediction_f = []
    for j in range(0, senti.shape[0]):
        if senti[j] == prediction[j]:
            count_t = count_t + 1
        else:
            num_f.append(j)
            senti_t.append(senti[j])
            prediction_f.append(prediction[j])
    #accuracy; the original divided by an undefined `count`
    print count_t * 100.00 / senti.shape[0]
    plt.scatter(num_f, senti_t, c='r')
    plt.scatter(num_f, prediction_f, c='b')
    plt.show()

    #collect the indices of every confusion between the sentiment values -1, 0, 1
    for j in range(0, senti.shape[0]):
        if senti[j] == 1:
            if prediction[j] == 0:
                num_f1.append(j)
            elif prediction[j] == -1:
                num_f2.append(j)
        if senti[j] == 0:
            if prediction[j] == 1:
                num_f3.append(j)
            elif prediction[j] == -1:
                num_f4.append(j)
        if senti[j] == -1:
            if prediction[j] == 1:
                num_f5.append(j)
            elif prediction[j] == 0:
                num_f6.append(j)
    print num_f1, len(num_f1)
    print num_f2, len(num_f2)
    print num_f3, len(num_f3)
    print num_f4, len(num_f4)
    print num_f5, len(num_f5)
    print num_f6, len(num_f6)
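# Worked example (added for illustration, not part of the original file): how
# the weighted ensemble above maps two rows of per-class confidences to a
# polarity. All numbers are made up.
def _ensemble_example():
    c1 = np.array([[0.2, 0.5, 0.9]])    # sp1 confidences for one message
    c2 = np.array([[0.1, 0.8, 0.3]])    # sp2 confidences for the same message
    conf = c1 * 0.66 + c2 * (1 - 0.66)  # -> [[0.166, 0.602, 0.696]]
    return np.argmax(conf[0]) - 1       # -> 1, i.e. "Positive"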
def arkTags(messages):
    return arktagger.pos_tag_list(messages)
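# Usage sketch (added for illustration; the messages are made up):
#
#     tags = arkTags(["I love this phone!", "worst. service. ever."])
#     # -> one POS tag sequence per message, from the ARK tagger wrapped above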
def main(messages_test):
    #tokenize all messages
    tokens_test = tokenize(messages_test)
    #compute pos tags for all messages
    pos_tags_test = arktagger.pos_tag_list(messages_test)
    #compute pos tag bigrams
    pos_bigrams_test = getBigrams(pos_tags_test)
    #compute pos tag trigrams
    pos_trigrams_test = getTrigrams(pos_tags_test)

    now = time.time()

    #load scores
    (pos_tags_scores_neutral, pos_tags_scores_positive, pos_tags_scores_negative,
     pos_bigrams_scores_neutral, pos_bigrams_scores_positive, pos_bigrams_scores_negative,
     pos_trigrams_scores_neutral, pos_trigrams_scores_positive, pos_trigrams_scores_negative,
     mpqaScores) = loadScores()
    #load lexicons
    negationList, slangDictionary, lexicons, mpqa_lexicons = loadLexiconsFromFile()
    #load clusters
    clusters = loadClustersFromFile()
    print "Resources loaded"

    #load Glove embeddings
    d = 200
    glove = loadGlove(d)

    #Subjectivity Detection Features
    #SD1 features
    features_test_1 = features.getFeatures(
        messages_test, tokens_test, pos_tags_test, slangDictionary, lexicons,
        mpqa_lexicons, pos_bigrams_test, pos_trigrams_test,
        pos_bigrams_scores_negative, pos_bigrams_scores_positive,
        pos_trigrams_scores_negative, pos_trigrams_scores_positive,
        pos_tags_scores_negative, pos_tags_scores_positive, mpqaScores,
        negationList, clusters, pos_bigrams_scores_neutral,
        pos_trigrams_scores_neutral, pos_tags_scores_neutral)
    #SD2 features: centroid of the Glove vectors of each message's tokens
    features_test_2 = []
    for i in range(0, len(messages_test)):
        features_test_2.append(glove.findCentroid(tokens_test[i]))
    features_test_2 = np.array(features_test_2)

    #regularize features
    features_test_1 = regularization.regularize(features_test_1)
    features_test_2 = regularization.regularizeHorizontally(features_test_2)

    #load SD classifiers
    with open('resources/sd_models.pkl', 'rb') as input:
        sd1 = pickle.load(input)
        sd2 = pickle.load(input)

    #get confidence scores
    test_confidence_1 = sd1.decision_function(features_test_1)
    test_confidence_2 = sd2.decision_function(features_test_2)

    #squash confidence scores into (0, 1); despite its name, this is the
    #logistic (sigmoid) function
    softmax = lambda x: 1 / (1. + math.exp(-x))
    test_confidence_1 = [softmax(conf) for conf in test_confidence_1]
    test_confidence_2 = [softmax(conf) for conf in test_confidence_2]
    test_confidence_1 = np.array(test_confidence_1)
    test_confidence_2 = np.array(test_confidence_2)

    #Sentiment Polarity Features (append confidence scores to SD features)
    #SP1 features
    features_test_1 = np.hstack((features_test_1, test_confidence_1.reshape(test_confidence_1.shape[0], 1)))
    #SP2 features
    features_test_2 = np.hstack((features_test_2, test_confidence_2.reshape(test_confidence_2.shape[0], 1)))

    #load SP classifiers
    with open('resources/sp_models.pkl', 'rb') as input:
        sp1 = pickle.load(input)
        sp2 = pickle.load(input)

    #get confidence scores of every system
    confidence1 = sp1.decision_function(features_test_1)
    confidence2 = sp2.decision_function(features_test_2)
    for i in range(0, confidence1.shape[0]):
        for j in range(0, confidence1.shape[1]):
            confidence1[i][j] = softmax(confidence1[i][j])
    for i in range(0, confidence2.shape[0]):
        for j in range(0, confidence2.shape[1]):
            confidence2[i][j] = softmax(confidence2[i][j])

    #ensemble confidence scores with weight W
    W = 0.66
    confidence = confidence1 * W + confidence2 * (1 - W)

    #final prediction: argmax over the class columns, shifted to {-1, 0, 1}
    prediction = np.array([np.argmax(x) - 1 for x in confidence])

    print "Prediction\n"
    for i in range(0, prediction.shape[0]):
        if prediction[i] == -1:
            pol = "Negative"
        elif prediction[i] == 0:
            pol = "Neutral"
        else:
            pol = "Positive"
        print "Message : " + messages_test[i] + " Polarity : " + pol + "\n"
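# Note (added for illustration, not part of the original file): the `softmax`
# lambda above is the one-variable logistic (sigmoid) function, applied
# element-wise to squash raw SVM decision values into (0, 1). The inputs
# below are made up.
def _sigmoid_example():
    raw = [-2.0, 0.0, 2.0]
    return [1 / (1. + math.exp(-x)) for x in raw]  # -> [~0.119, 0.5, ~0.881]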
def main(f):
    print "System training started"

    #load training dataset
    dataset_train = f
    ids, labels_train, messages_train = tsvreader.opentsv(dataset_train)
    print "Train data loaded"

    #labels for subjectivity detection (2 categories)
    temp_labels_train = [0 if x == "neutral" else 1 for x in labels_train]
    #labels for polarity detection (3 categories)
    labels_train = [0 if x == "neutral" else -1 if x == "negative" else 1 for x in labels_train]
    #convert labels to numpy arrays
    temp_labels_train = np.array(temp_labels_train)
    labels_train = np.array(labels_train)

    #load word clusters
    clusters = loadClusters()
    print "Clusters loaded"
    #load lexicons
    negationList, slangDictionary, lexicons, mpqa_lexicons = loadLexicons()
    print "Lexicons loaded"

    #tokenize all messages
    tokens_train = tokenize(messages_train)
    print "Messages tokenized"

    #compute pos tags for all messages
    pos_tags_train = arktagger.pos_tag_list(messages_train)
    print "Pos tags computed"
    #compute pos tag bigrams and trigrams
    pos_bigrams_train = getBigrams(pos_tags_train)
    pos_trigrams_train = getTrigrams(pos_tags_train)

    #get the unique pos tags, bigrams and trigrams from the training set
    unique_pos_tags = getPosTagsSet(pos_tags_train)
    unique_bigrams = getBigramsSet(pos_bigrams_train)
    unique_trigrams = getTrigramsSet(pos_trigrams_train)

    #compute POS tag scores per class (neutral = 0, positive = 1, negative = -1)
    pos_tags_scores_neutral = posTagsScore(unique_pos_tags, 0, pos_tags_train, labels_train)
    pos_tags_scores_positive = posTagsScore(unique_pos_tags, 1, pos_tags_train, labels_train)
    pos_tags_scores_negative = posTagsScore(unique_pos_tags, -1, pos_tags_train, labels_train)
    pos_bigrams_scores_neutral = posBigramsScore(unique_bigrams, 0, pos_bigrams_train, labels_train)
    pos_bigrams_scores_positive = posBigramsScore(unique_bigrams, 1, pos_bigrams_train, labels_train)
    pos_bigrams_scores_negative = posBigramsScore(unique_bigrams, -1, pos_bigrams_train, labels_train)
    pos_trigrams_scores_neutral = posTrigramsScore(unique_trigrams, 0, pos_trigrams_train, labels_train)
    pos_trigrams_scores_positive = posTrigramsScore(unique_trigrams, 1, pos_trigrams_train, labels_train)
    pos_trigrams_scores_negative = posTrigramsScore(unique_trigrams, -1, pos_trigrams_train, labels_train)

    #compute mpqa scores
    mpqaScores = getScores(mpqa_lexicons, messages_train, labels_train, neutral=True)

    #save scores and other resources for future use
    savePosScores(pos_tags_scores_neutral, pos_tags_scores_positive,
                  pos_tags_scores_negative, pos_bigrams_scores_neutral,
                  pos_bigrams_scores_positive, pos_bigrams_scores_negative,
                  pos_trigrams_scores_neutral, pos_trigrams_scores_positive,
                  pos_trigrams_scores_negative, mpqaScores)
    #save lexicons
    saveLexicons(negationList, slangDictionary, lexicons, mpqa_lexicons)
    #save clusters
    saveClusters(clusters)

    #load Glove embeddings
    d = 200
    glove = GloveDictionary.Glove(d)
    #save Glove embeddings for future use
    saveGlove(glove)

    #Subjectivity Detection Features
    #SD1 features
    features_train_1 = features.getFeatures(
        messages_train, tokens_train, pos_tags_train, slangDictionary, lexicons,
        mpqa_lexicons, pos_bigrams_train, pos_trigrams_train,
        pos_bigrams_scores_negative, pos_bigrams_scores_positive,
        pos_trigrams_scores_negative, pos_trigrams_scores_positive,
        pos_tags_scores_negative, pos_tags_scores_positive, mpqaScores,
        negationList, clusters, pos_bigrams_scores_neutral,
        pos_trigrams_scores_neutral, pos_tags_scores_neutral)
    #SD2 features: centroid of the Glove vectors of each message's tokens
    features_train_2 = []
    for i in range(0, len(messages_train)):
        features_train_2.append(glove.findCentroid(tokens_train[i]))
    features_train_2 = np.array(features_train_2)

    #regularize features
    features_train_1 = regularization.regularize(features_train_1)
    features_train_2 = regularization.regularizeHorizontally(features_train_2)

    #penalty parameter C of the error term for every SD system
    C1 = 0.001953125
    C2 = 1.4068830572470667

    #get confidence scores
    train_confidence_1 = getConfidenceScores(features_train_1, temp_labels_train, C1)
    train_confidence_2 = getConfidenceScores(features_train_2, temp_labels_train, C2)
    #squash confidence scores into (0, 1); despite its name, this is the
    #logistic (sigmoid) function
    softmax = lambda x: 1 / (1. + math.exp(-x))
    train_confidence_1 = np.array([softmax(conf) for conf in train_confidence_1])
    train_confidence_2 = np.array([softmax(conf) for conf in train_confidence_2])

    #train SD classifiers
    sd1 = SVM.train(features_train_1, temp_labels_train, c=C1, k="linear")
    sd2 = SVM.train(features_train_2, temp_labels_train, c=C2, k="linear")

    #Sentiment Polarity Features (append confidence scores to SD features)
    #SP1 features
    features_train_1 = np.hstack((features_train_1, train_confidence_1.reshape(train_confidence_1.shape[0], 1)))
    #SP2 features
    features_train_2 = np.hstack((features_train_2, train_confidence_2.reshape(train_confidence_2.shape[0], 1)))

    #penalty parameter C of the error term for every SP system
    C1 = 0.003410871889693192
    C2 = 7.396183688299606

    #train SP classifiers
    sp1 = SVM.train(features_train_1, labels_train, c=C1, k="linear")
    sp2 = SVM.train(features_train_2, labels_train, c=C2, k="linear")

    #save trained models
    saveModels(sd1, sd2, sp1, sp2)
    print "System training completed!"
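# Shape sketch (added for illustration, not part of the original file): the
# np.hstack/reshape pattern above appends each SD confidence as one extra
# feature column before the SP stage. Sizes are made up.
def _stack_example():
    feats = np.zeros((3, 5))          # 3 messages, 5 features each
    conf = np.array([0.2, 0.7, 0.9])  # one SD confidence per message
    return np.hstack((feats, conf.reshape(conf.shape[0], 1)))  # shape (3, 6)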
labels_train = [0 if x == "negative" else 1 for x in labels_train]
labels_test = [0 if x == "negative" else 1 for x in labels_test]

#tokenize all messages
tokens_train = tokenize(messages_train)
tokens_test = tokenize(messages_test)

#initialize glove lexicon
glove = GloveDictionary.Glove()

#compute pos tags for all messages
pos_tags_train = arktagger.pos_tag_list(messages_train)
pos_tags_test = arktagger.pos_tag_list(messages_test)

print("glove initialized ... ")
def test(self, dom):
    test_pos = []
    temp_vector = []
    test_vector = []
    #characters to strip before the unigram lexicon lookups
    chars_to_remove = ['=', '!', '?', ',', '<', '.', '>', '/', ';', ':', ']', '}', '[', '{',
                       '|', '@', '$', '%', '^', '&', '*', '(', ')', '_', '-', '+', '"',
                       '1', '2', '3', '4', '5', '6', '7', '8', '9']
    reviews = ET.parse(self.test_path).getroot().findall('Review')
    for review in reviews:
        sentences = review[0] #get the sentences
        for sentence in sentences:
            if len(sentence) > 1:
                opinions = sentence[1] #get the opinions field
                if len(opinions) > 0: #check that there are aspects
                    t = sentence[0].text
                    t2 = word_tokenize(t) #tokenize without lowercasing, to check for caps
                    capsList, capsCounter = self.listCaps(t2) #store the all-caps words of the text
                    text = word_tokenize(t.lower()) #tokenize, converted to lower case
                    for opinion in opinions:
                        test_pos.append(t) #store the text for the POS tagging
                        #calculate a score for each lexicon
                        temp0 = self.checkLexiconReady(text, self.stopwords, capsList, self.negation, self.AFINN_lexicon) #AFINN lexicon scores
                        temp3 = self.checkLexiconReady(text, self.stopwords, capsList, self.negation, self.huLiu_lexicon) #Hu and Liu lexicon scores
                        temp4 = self.checkLexiconReady(text, self.stopwords, capsList, self.negation, self.nrc_lexicon) #NRC lexicon scores
                        temp1 = self.checkLexiconUni(text, chars_to_remove, self.stopwords, capsList, self.negation, self.train_unigram_lexicon) #unigram lexicon scores from the training data of each domain
                        temp7 = self.howManyUpperFirst(t2) #number of words starting with a capital letter
                        temp9 = [self.howMany(text, '?'), self.howMany(text, '!')] #numbers of question and exclamation marks
                        temp11 = self.lastSymbol(t2) #whether the last symbol is a question or an exclamation mark
                        cat = opinion.attrib['category'].split('#') #one-hot features for the entity and the attribute
                        cat0 = [1 if ent == cat[0] else 0 for ent in self.entities]
                        cat1 = [1 if attr == cat[1] else 0 for attr in self.attributes]
                        temp12 = [len(opinions)] + cat0 + cat1
                        temp = temp0 + temp1 + temp3 + temp4 + temp7 + temp9 + temp11 + [capsCounter] + temp12
                        temp_vector.append(temp) #build the feature vector
    temp_vector = self.normalize(temp_vector) #normalize the vectors
    pos = arktagger.pos_tag_list(test_pos) #compute the POS tags
    test_pos = self.howManyPos(pos)
    test_pos_bi = self.calcScorePosBi(pos)
    test_pos_bi = self.normalize(test_pos_bi)
    for i in range(len(temp_vector)): #join the matrices
        test_vector.append(temp_vector[i] + test_pos[i] + test_pos_bi[i])
    print
    print '---- End of Test ----'
    return test_vector
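# Usage sketch (added for illustration, not part of the original source): the
# matrices from train() and test() are meant for a supervised classifier; the
# instance name and the scikit-learn calls below are assumptions.
#
#     X_train, y_train = detector.train(dom)
#     X_test = detector.test(dom)
#     from sklearn.svm import LinearSVC
#     clf = LinearSVC().fit(X_train, y_train)
#     predicted_polarities = clf.predict(X_test)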