Code example #1
	def getTestVector(self, sentence, opinionCategories, dom):
		test_pos = []
		temp_vector = []
		test_vector = []

		chars_to_remove = ['=', '!', '?', ',', '<', '.', '>', '/', ';', ':', ']', '}', '[', '{', '|', '@', '$', '%', '^', '&', '*', '(', ')', '_', '-', '+', '"', '1', '2', '3', '4', '5', '6', '7', '8', '9'] #characters (punctuation and digits) to strip from tokens

		t2 = word_tokenize(sentence)
		capsList, capsCounter = self.listCaps(t2) #storing the caps words of the text
		text = word_tokenize(sentence.lower())
		
		for opinionCategory in opinionCategories:
			test_pos.append(sentence)
			
			#calculate score for each lexicon
			temp0 = self.checkLexiconReady(text, self.stopwords, capsList, self.negation, self.AFINN_lexicon) #Afinn lexicon scores
			temp3 = self.checkLexiconReady(text, self.stopwords, capsList, self.negation, self.huLiu_lexicon) #Hu and Liu lexicon scores
			temp4 = self.checkLexiconReady(text, self.stopwords, capsList, self.negation, self.nrc_lexicon) #NRC lexicon scores
			
			temp1 = self.checkLexiconUni(text, chars_to_remove, self.stopwords, capsList, self.negation, self.train_unigram_lexicon) #unigram lexicon scores from the train data of each domain
			
			temp7 = self.howManyUpperFirst(t2) #num of words starting with capitalized first letter
			temp9 = [self.howMany(text, '?'), self.howMany(text, '!')] #number of question and exclamation marks
			temp11 = self.lastSymbol(t2) #is the last symbol a question or an exclamation mark
									
			cat = opinionCategory.split('#') #a feature for the entity and the attribute
			cat0 = []
			for ent in self.entities:
				if ent == cat[0]:
					cat0.append(1)
				else:
					cat0.append(0)
			cat1 = []
			for attr in self.attributes:
				if attr == cat[1]:
					cat1.append(1)
				else:
					cat1.append(0)
			temp12 = [len(opinionCategories)] + cat0 + cat1
			
			temp = temp0 + temp1 + temp3 + temp4 + temp7 + temp9 + temp11 + [capsCounter] + temp12
			temp_vector.append(temp) #creating the features vector
		  
		temp_vector = self.normalize(temp_vector) #normalize the vector              
						
		pos = arktagger.pos_tag_list(test_pos) #finding the pos tags            
		test_pos = self.howManyPos(pos)

		test_pos_bi = self.calcScorePosBi(pos)
		test_pos_bi = self.normalize(test_pos_bi)

		for i in range(len(temp_vector)): #join the matrices
			test_vector.append(temp_vector[i] + test_pos[i] + test_pos_bi[i])

		#print
		#print '---- End of Test ----'
		return test_vector
Code example #2
	def train(self,dom):
		
		temp_vector = []
		train_tags = []
		train_pos = []
		train_vector = []

		chars_to_remove = ['=', '!', '?', ',', '<', '.', '>', '/', ';', ':', ']', '}', '[', '{', '|', '@', '$', '%', '^', '&', '*', '(', ')', '_', '-', '+', '"', '1', '2', '3', '4', '5', '6', '7', '8', '9'] #characters (punctuation and digits) to strip from tokens

		reviews = ET.parse(self.train_path).getroot().findall('Review')
		for review in reviews:
			sentences = review[0] #get the sentences
			for sentence in sentences:
				if (len(sentence) > 1):
					opinions = sentence[1] #getting the opinions field
					if ( len(opinions) > 0): #check if there are aspects 
						
						t = sentence[0].text
						t2 = word_tokenize(t) #tokenize, don't convert to lower case, check for caps
						capsList, capsCounter = self.listCaps(t2) #storing the caps words of the text
						text = word_tokenize(t.lower()) #tokenize, convert to lower case
						
						for opinion in opinions: 
							category = opinion.attrib['polarity']    
							train_tags.append(category) #store the category
							train_pos.append(t) #store the text for the pos tagging 

							#calculate score for each lexicon
							temp0 = self.checkLexiconReady(text, self.stopwords, capsList, self.negation, self.AFINN_lexicon) #afinn lexicon scores
							temp3 = self.checkLexiconReady(text, self.stopwords, capsList, self.negation, self.huLiu_lexicon) #Hu and Liu lexicon scores
							temp4 = self.checkLexiconReady(text, self.stopwords, capsList, self.negation, self.nrc_lexicon) #NRC lexicon scores
							
							temp1 = self.checkLexiconUni(text, chars_to_remove, self.stopwords, capsList, self.negation, self.train_unigram_lexicon) #unigram lexicon scores from the train data of each domain

							temp7 = self.howManyUpperFirst(t2) #num of words starting with capitalized first letter
							temp9 = [self.howMany(text, '?'), self.howMany(text, '!')] #number of question and exclamation marks
							temp11 = self.lastSymbol(t2) #is the last symbol a question or an exclamation mark
							
							cat = opinion.attrib['category'].split('#') #a feature for the entity and the attribute
							cat0 = []
							for ent in self.entities:
								if ent == cat[0]:
									cat0.append(1)
								else:
									cat0.append(0)
							cat1 = []
							for attr in self.attributes:
								if attr == cat[1]:
									cat1.append(1)
								else:
									cat1.append(0)
							temp12 = [len(opinions)] + cat0 + cat1

							temp = temp0 + temp1 + temp3 + temp4 + temp7 + temp9 + temp11 + [capsCounter] + temp12
							temp_vector.append(temp) #creating the features vector

		temp_vector = self.normalize(temp_vector) #normalize the vector

		pos = arktagger.pos_tag_list(train_pos) #getting the pos tags
		train_pos = self.howManyPos(pos) #calculating the number of the pos tags

		train_pos_bi = self.calcScorePosBi(pos) #calculating the POS tag bigram scores for each text
		train_pos_bi = self.normalize(train_pos_bi)

		for i in range(len(temp_vector)): #join the matrices
			train_vector.append(temp_vector[i] + train_pos[i] + train_pos_bi[i])

		print
		print '---- End of train ----'

		return train_vector,train_tags
Code example #3
    plt.title(t)
    plt.legend(loc="upper right")
    plt.xlabel("sentiment score")

    plt.show()
    
#read labels and messages from dataset
dataset = "datasets/train15.tsv"
#dataset = "datasets/training-set-sample.tsv"
labels, messages = tsvreader.opentsv(dataset)

##labels = labels[0:100]
##messages = messages[0:100]

#pos tags of messages
tags = arktagger.pos_tag_list(messages)

#initialize lists that hold the sentiment score of every message for every Lexicon
socal_scores = []
minqinghu_scores = []
afinn_scores = []
nrc1_scores = []
nrc2_scores = []
nrc3_scores = []
nrc4_scores = []
nrc5_scores = []
mpqa_scores = []
swn_scores = []

#Lexicon objects
Code example #4
def main(messages_test):
    #tokenize all messages
    tokens_test = tokenize(messages_test)
    #compute pos tags for all messages
    pos_tags_test = arktagger.pos_tag_list(messages_test)
    #compute pos tag bigrams
    pos_bigrams_test = getBigrams(pos_tags_test)
    #compute pos tag trigrams
    pos_trigrams_test = getTrigrams(pos_tags_test)

    now = time.time()

    #load scores
    pos_tags_scores_neutral, pos_tags_scores_positive, pos_tags_scores_negative, pos_bigrams_scores_neutral, pos_bigrams_scores_positive, pos_bigrams_scores_negative, pos_trigrams_scores_neutral, pos_trigrams_scores_positive, pos_trigrams_scores_negative, mpqaScores = loadScores()

    #load lexicons
    negationList, slangDictionary, lexicons, mpqa_lexicons = loadLexiconsFromFile()

    #load clusters
    clusters = loadClustersFromFile()

    print "Resources loaded"

    #load Glove embeddings
    d = 25
    glove = loadGlove(d)

    #Subjectivity Detection Features

    #SD1 features
    features_test_1 = features.getFeatures(
        messages_test, tokens_test, pos_tags_test, slangDictionary, lexicons,
        mpqa_lexicons, pos_bigrams_test, pos_trigrams_test,
        pos_bigrams_scores_negative, pos_bigrams_scores_positive,
        pos_trigrams_scores_negative, pos_trigrams_scores_positive,
        pos_tags_scores_negative, pos_tags_scores_positive, mpqaScores,
        negationList, clusters, pos_bigrams_scores_neutral,
        pos_trigrams_scores_neutral, pos_tags_scores_neutral)

    #SD2 features
    features_test_2 = []
    for i in range(0, len(messages_test)):
        features_test_2.append(glove.findCentroid(tokens_test[i]))

    features_test_2 = np.array(features_test_2)

    #regularize features
    features_test_1 = regularization.regularize(features_test_1)
    features_test_2 = regularization.regularizeHorizontally(features_test_2)

    #debug output: print the regularized feature matrices
    print "After regularization"
    print features_test_1
    print features_test_2

    #load SD classifiers
    with open('resources/sd_models.pkl', 'rb') as input:
        sd1 = pickle.load(input)
        sd2 = pickle.load(input)

    #get confidence scores
    test_confidence_1 = sd1.decision_function(features_test_1)
    test_confidence_2 = sd2.decision_function(features_test_2)

    #normalize confidence scores to (0, 1); despite the name, this lambda is the logistic (sigmoid) function, not a softmax
    softmax = lambda x: 1 / (1. + math.exp(-x))
    test_confidence_1 = [softmax(conf) for conf in test_confidence_1]
    test_confidence_2 = [softmax(conf) for conf in test_confidence_2]

    test_confidence_1 = np.array(test_confidence_1)
    test_confidence_2 = np.array(test_confidence_2)

    #Sentiment Polarity Features (append confidence scores to SD features)

    #SP1 features
    features_test_1 = np.hstack(
        (features_test_1,
         test_confidence_1.reshape(test_confidence_1.shape[0], 1)))

    #SP2 features
    features_test_2 = np.hstack(
        (features_test_2,
         test_confidence_2.reshape(test_confidence_2.shape[0], 1)))

    #load SP classifiers
    with open('resources/sp_models.pkl', 'rb') as input:
        sp1 = pickle.load(input)
        sp2 = pickle.load(input)

    #get confidence scores of every system
    confidence1 = sp1.decision_function(features_test_1)
    confidence2 = sp2.decision_function(features_test_2)

    for i in range(0, confidence1.shape[0]):
        for j in range(0, confidence1.shape[1]):
            confidence1[i][j] = softmax(confidence1[i][j])

    for i in range(0, confidence2.shape[0]):
        for j in range(0, confidence2.shape[1]):
            confidence2[i][j] = softmax(confidence2[i][j])

    #ensemble confidence scores with weight W
    W = 0.66

    confidence = confidence1 * W + confidence2 * (1 - W)
    print "confidence"
    print confidence

    #get final prediction
    prediction = [np.argmax(x) - 1 for x in confidence]

    prediction = np.array(prediction)

    print "Prediction\n"
    for i in range(0, prediction.shape[0]):
        if prediction[i] == -1:
            pol = "Negative"
        elif prediction[i] == 0:
            pol = "Neutral"
        elif prediction[i] == 1:
            pol = "Positive"
        print "Message : " + messages_test[i] + "Polarity : " + pol + "\n"

    #accuracy and indices of the misclassified messages
    #(assumes senti holds the gold polarity labels (-1/0/1) for messages_test as a numpy array)
    count_t = 0
    num_f = []
    num_f1 = []
    num_f2 = []
    num_f3 = []
    num_f4 = []
    num_f5 = []
    num_f6 = []
    senti_t = []
    prediction_f = []
    for j in range(0, senti.shape[0]):
        if senti[j] == prediction[j]:
            count_t = count_t + 1

        else:
            num_f.append(j)
            senti_t.append(senti[j])
            prediction_f.append(prediction[j])

    print count_t * 100.00 / senti.shape[0] #accuracy as a percentage
    plt.scatter(num_f, senti_t, c='r')
    plt.scatter(num_f, prediction_f, c='b')
    plt.show()

    #break down errors by gold label (-1, 0, 1) vs. predicted label
    for j in range(0, senti.shape[0]):
        if senti[j] == 1:
            if prediction[j] == 0:
                num_f1.append(j)
            elif prediction[j] == -1:
                num_f2.append(j)
        if senti[j] == 0:
            if prediction[j] == 1:
                num_f3.append(j)
            elif prediction[j] == -1:
                num_f4.append(j)
        if senti[j] == -1:
            if prediction[j] == 1:
                num_f5.append(j)
            elif prediction[j] == 0:
                num_f6.append(j)

    print num_f1, len(num_f1)
    print num_f2, len(num_f2)
    print num_f3, len(num_f3)
    print num_f4, len(num_f4)
    print num_f5, len(num_f5)
    print num_f6, len(num_f6)
Code example #5
def arkTags(messages):
    return arktagger.pos_tag_list(messages)
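A minimal usage sketch for this wrapper. It is hedged: it assumes the project's arktagger module is importable and that pos_tag_list takes a list of message strings and returns one list of POS tags per message (neither is confirmed by the snippet itself); the example messages are made up for illustration.

#usage sketch (assumption: arktagger.pos_tag_list returns one tag sequence per input message)
import arktagger

def arkTags(messages):
    return arktagger.pos_tag_list(messages)

if __name__ == "__main__":
    messages = ["I love this phone !", "the battery died after an hour ..."] #illustrative input
    tags = arkTags(messages)
    for message, message_tags in zip(messages, tags):
        print message, "->", message_tags #one tag sequence per message (Python 2 print)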
Code example #6
def main(messages_test):
	#tokenize all messages
	tokens_test = tokenize(messages_test)
	#compute pos tags for all messages
	pos_tags_test = arktagger.pos_tag_list(messages_test)
	#compute pos tag bigrams
	pos_bigrams_test = getBigrams(pos_tags_test)
	#compute pos tag trigrams
	pos_trigrams_test = getTrigrams(pos_tags_test)

	now = time.time()

	#load scores
	pos_tags_scores_neutral, pos_tags_scores_positive, pos_tags_scores_negative, pos_bigrams_scores_neutral, pos_bigrams_scores_positive, pos_bigrams_scores_negative, pos_trigrams_scores_neutral, pos_trigrams_scores_positive, pos_trigrams_scores_negative, mpqaScores = loadScores()
	
	#load lexicons
	negationList, slangDictionary, lexicons, mpqa_lexicons = loadLexiconsFromFile()
	
	#load clusters
	clusters = loadClustersFromFile()
		
	print "Resources loaded"
	
	#load Glove embeddings
	d = 200
	glove = loadGlove(d)
		
	#Subjectivity Detection Features
	
	#SD1 features
	features_test_1 = features.getFeatures(messages_test,tokens_test,pos_tags_test,slangDictionary,lexicons,mpqa_lexicons,pos_bigrams_test,pos_trigrams_test,pos_bigrams_scores_negative,pos_bigrams_scores_positive,pos_trigrams_scores_negative,pos_trigrams_scores_positive,pos_tags_scores_negative,pos_tags_scores_positive,mpqaScores,negationList,clusters,pos_bigrams_scores_neutral,pos_trigrams_scores_neutral,pos_tags_scores_neutral)
	
	#SD2 features
	features_test_2=[]
	for i in range(0,len(messages_test)):
		features_test_2.append(glove.findCentroid(tokens_test[i]))

	features_test_2 = np.array(features_test_2)

	#regularize features
	features_test_1=regularization.regularize(features_test_1)
	features_test_2 = regularization.regularizeHorizontally(features_test_2)
	
	#load SD classifiers
	with open('resources/sd_models.pkl', 'rb') as input:
		sd1 = pickle.load(input)
		sd2 = pickle.load(input)
		
	#get confidence scores
	test_confidence_1 = sd1.decision_function(features_test_1)
	test_confidence_2 = sd2.decision_function(features_test_2)

	#normalize confidence scores to (0, 1); despite the name, this lambda is the logistic (sigmoid) function, not a softmax
	softmax = lambda x: 1 / (1. + math.exp(-x))
	test_confidence_1 = [softmax(conf) for conf in test_confidence_1]
	test_confidence_2 = [softmax(conf) for conf in test_confidence_2]
	
	test_confidence_1 = np.array(test_confidence_1)
	test_confidence_2 = np.array(test_confidence_2)

	#Sentiment Polarity Features (append confidence scores to SD features)
	
	#SP1 features
	features_test_1 = np.hstack((features_test_1,test_confidence_1.reshape(test_confidence_1.shape[0],1)))
	#SP2 features
	features_test_2 = np.hstack((features_test_2,test_confidence_2.reshape(test_confidence_2.shape[0],1)))

	#load SP classifiers
	with open('resources/sp_models.pkl', 'rb') as input:
		sp1 = pickle.load(input)
		sp2 = pickle.load(input)
		
	#get confidence scores of every system
	confidence1 = sp1.decision_function(features_test_1)
	confidence2 = sp2.decision_function(features_test_2)

	for i in range(0,confidence1.shape[0]):
		for j in range(0,confidence1.shape[1]):
			confidence1[i][j] = softmax(confidence1[i][j])

	for i in range(0,confidence2.shape[0]):
		for j in range(0,confidence2.shape[1]):
			confidence2[i][j] = softmax(confidence2[i][j])

	#ensemble confidence scores with weight W
	W=0.66

	confidence = confidence1*W + confidence2*(1-W)

	#get final prediction
	prediction = [np.argmax(x)-1 for x in confidence]
	prediction = np.array(prediction)

	print "Prediction\n"
	for i in range(0, prediction.shape[0]):
		if prediction[i] == -1:
			pol = "Negative"
		elif prediction[i] == 0:
			pol = "Neutral"
		else:
			pol = "Positive"
                print "Message : " + messages_test[i]+"Polarity : "+pol+"\n"
Code example #7
def main(messages_test):
    #tokenize all messages
    tokens_test = tokenize(messages_test)
    #compute pos tags for all messages
    pos_tags_test = arktagger.pos_tag_list(messages_test)
    #compute pos tag bigrams
    pos_bigrams_test = getBigrams(pos_tags_test)
    #compute pos tag trigrams
    pos_trigrams_test = getTrigrams(pos_tags_test)

    now = time.time()

    #load scores
    pos_tags_scores_neutral, pos_tags_scores_positive, pos_tags_scores_negative, pos_bigrams_scores_neutral, pos_bigrams_scores_positive, pos_bigrams_scores_negative, pos_trigrams_scores_neutral, pos_trigrams_scores_positive, pos_trigrams_scores_negative, mpqaScores = loadScores()

    #load lexicons
    negationList, slangDictionary, lexicons, mpqa_lexicons = loadLexiconsFromFile()

    #load clusters
    clusters = loadClustersFromFile()

    print "Resources loaded"

    #load Glove embeddings
    d = 200
    glove = loadGlove(d)

    #Subjectivity Detection Features

    #SD1 features
    features_test_1 = features.getFeatures(
        messages_test, tokens_test, pos_tags_test, slangDictionary, lexicons,
        mpqa_lexicons, pos_bigrams_test, pos_trigrams_test,
        pos_bigrams_scores_negative, pos_bigrams_scores_positive,
        pos_trigrams_scores_negative, pos_trigrams_scores_positive,
        pos_tags_scores_negative, pos_tags_scores_positive, mpqaScores,
        negationList, clusters, pos_bigrams_scores_neutral,
        pos_trigrams_scores_neutral, pos_tags_scores_neutral)

    #SD2 features
    features_test_2 = []
    for i in range(0, len(messages_test)):
        features_test_2.append(glove.findCentroid(tokens_test[i]))

    features_test_2 = np.array(features_test_2)

    #regularize features
    features_test_1 = regularization.regularize(features_test_1)
    features_test_2 = regularization.regularizeHorizontally(features_test_2)

    #load SD classifiers
    with open('resources/sd_models.pkl', 'rb') as input:
        sd1 = pickle.load(input)
        sd2 = pickle.load(input)

    #get confidence scores
    test_confidence_1 = sd1.decision_function(features_test_1)
    test_confidence_2 = sd2.decision_function(features_test_2)

    #normalize confidence scores to (0, 1); despite the name, this lambda is the logistic (sigmoid) function, not a softmax
    softmax = lambda x: 1 / (1. + math.exp(-x))
    test_confidence_1 = [softmax(conf) for conf in test_confidence_1]
    test_confidence_2 = [softmax(conf) for conf in test_confidence_2]

    test_confidence_1 = np.array(test_confidence_1)
    test_confidence_2 = np.array(test_confidence_2)

    #Sentiment Polarity Features (append confidence scores to SD features)

    #SP1 features
    features_test_1 = np.hstack(
        (features_test_1,
         test_confidence_1.reshape(test_confidence_1.shape[0], 1)))
    #SP2 features
    features_test_2 = np.hstack(
        (features_test_2,
         test_confidence_2.reshape(test_confidence_2.shape[0], 1)))

    #load SP classifiers
    with open('resources/sp_models.pkl', 'rb') as input:
        sp1 = pickle.load(input)
        sp2 = pickle.load(input)

    #get confidence scores of every system
    confidence1 = sp1.decision_function(features_test_1)
    confidence2 = sp2.decision_function(features_test_2)

    for i in range(0, confidence1.shape[0]):
        for j in range(0, confidence1.shape[1]):
            confidence1[i][j] = softmax(confidence1[i][j])

    for i in range(0, confidence2.shape[0]):
        for j in range(0, confidence2.shape[1]):
            confidence2[i][j] = softmax(confidence2[i][j])

    #ensemble confidence scores with weight W
    W = 0.66

    confidence = confidence1 * W + confidence2 * (1 - W)

    #get final prediction
    prediction = [np.argmax(x) - 1 for x in confidence]
    prediction = np.array(prediction)

    print "Prediction\n"
    for i in range(0, prediction.shape[0]):
        if prediction[i] == -1:
            pol = "Negative"
        elif prediction[i] == 0:
            pol = "Neutral"
        else:
            pol = "Positive"
        print "Message : " + messages_test[i] + "Polarity : " + pol + "\n"
Code example #8
def main(f):
    print "System training started"

    #load training dataset
    dataset_train = f
    ids, labels_train, messages_train = tsvreader.opentsv(dataset_train)
    print "Train data loaded"

    #labels for subjectivity detection (2 categories)
    temp_labels_train = [0 if x == "neutral" else 1 for x in labels_train]
    #labels for polarity detection (3 categories)
    labels_train = [
        0 if x == "neutral" else -1 if x == "negative" else 1
        for x in labels_train
    ]

    #convert labels to numpy arrays
    temp_labels_train = np.array(temp_labels_train)
    labels_train = np.array(labels_train)

    #load word clusters
    clusters = loadClusters()
    print "Clusters loaded"

    #load Lexicons
    negationList, slangDictionary, lexicons, mpqa_lexicons = loadLexicons()
    print "Lexicons loaded"

    #tokenize all messages
    tokens_train = tokenize(messages_train)
    print "Messages tokenized"

    #compute pos tags for all messages
    pos_tags_train = arktagger.pos_tag_list(messages_train)
    print "Pos tags computed"

    #compute pos tag bigrams
    pos_bigrams_train = getBigrams(pos_tags_train)
    #compute pos tag trigrams
    pos_trigrams_train = getTrigrams(pos_tags_train)

    #get the unique POS tags, bigrams, and trigrams from the training set
    unique_pos_tags = getPosTagsSet(pos_tags_train)
    unique_bigrams = getBigramsSet(pos_bigrams_train)
    unique_trigrams = getTrigramsSet(pos_trigrams_train)

    #compute POS tag scores
    pos_tags_scores_neutral = posTagsScore(unique_pos_tags, 0, pos_tags_train,
                                           labels_train)
    pos_tags_scores_positive = posTagsScore(unique_pos_tags, 1, pos_tags_train,
                                            labels_train)
    pos_tags_scores_negative = posTagsScore(unique_pos_tags, -1,
                                            pos_tags_train, labels_train)

    pos_bigrams_scores_neutral = posBigramsScore(unique_bigrams, 0,
                                                 pos_bigrams_train,
                                                 labels_train)
    pos_bigrams_scores_positive = posBigramsScore(unique_bigrams, 1,
                                                  pos_bigrams_train,
                                                  labels_train)
    pos_bigrams_scores_negative = posBigramsScore(unique_bigrams, -1,
                                                  pos_bigrams_train,
                                                  labels_train)

    pos_trigrams_scores_neutral = posTrigramsScore(unique_trigrams, 0,
                                                   pos_trigrams_train,
                                                   labels_train)
    pos_trigrams_scores_positive = posTrigramsScore(unique_trigrams, 1,
                                                    pos_trigrams_train,
                                                    labels_train)
    pos_trigrams_scores_negative = posTrigramsScore(unique_trigrams, -1,
                                                    pos_trigrams_train,
                                                    labels_train)

    #compute mpqa scores
    mpqaScores = getScores(mpqa_lexicons,
                           messages_train,
                           labels_train,
                           neutral=True)

    #save scores and other resources for future use
    savePosScores(pos_tags_scores_neutral, pos_tags_scores_positive,
                  pos_tags_scores_negative, pos_bigrams_scores_neutral,
                  pos_bigrams_scores_positive, pos_bigrams_scores_negative,
                  pos_trigrams_scores_neutral, pos_trigrams_scores_positive,
                  pos_trigrams_scores_negative, mpqaScores)
    #save lexicons
    saveLexicons(negationList, slangDictionary, lexicons, mpqa_lexicons)
    #save clusters
    saveClusters(clusters)

    #load Glove embeddings
    d = 200
    glove = GloveDictionary.Glove(d)

    #save Glove embeddings for future use
    saveGlove(glove)

    #Subjectivity Detection Features

    #SD1 features
    features_train_1 = features.getFeatures(
        messages_train, tokens_train, pos_tags_train, slangDictionary,
        lexicons, mpqa_lexicons, pos_bigrams_train, pos_trigrams_train,
        pos_bigrams_scores_negative, pos_bigrams_scores_positive,
        pos_trigrams_scores_negative, pos_trigrams_scores_positive,
        pos_tags_scores_negative, pos_tags_scores_positive, mpqaScores,
        negationList, clusters, pos_bigrams_scores_neutral,
        pos_trigrams_scores_neutral, pos_tags_scores_neutral)

    #SD2 features
    features_train_2 = []
    #for message in tokens_train :
    for i in range(0, len(messages_train)):
        features_train_2.append(glove.findCentroid(tokens_train[i]))
    features_train_2 = np.array(features_train_2)

    #regularize features
    features_train_1 = regularization.regularize(features_train_1)
    features_train_2 = regularization.regularizeHorizontally(features_train_2)

    #Penalty parameter C of the error term for every SD system
    C1 = 0.001953125
    C2 = 1.4068830572470667

    #get confidence scores
    train_confidence_1 = getConfidenceScores(features_train_1,
                                             temp_labels_train, C1)
    train_confidence_2 = getConfidenceScores(features_train_2,
                                             temp_labels_train, C2)

    #normalize confidence scores to (0, 1); despite the name, this lambda is the logistic (sigmoid) function, not a softmax
    softmax = lambda x: 1 / (1. + math.exp(-x))
    train_confidence_1 = [softmax(conf) for conf in train_confidence_1]
    train_confidence_2 = [softmax(conf) for conf in train_confidence_2]

    train_confidence_1 = np.array(train_confidence_1)
    train_confidence_2 = np.array(train_confidence_2)

    #train SD classifiers
    sd1 = SVM.train(features_train_1, temp_labels_train, c=C1, k="linear")
    sd2 = SVM.train(features_train_2, temp_labels_train, c=C2, k="linear")

    #Sentiment Polarity Features (append confidence scores to SD features)

    #SP1 features
    features_train_1 = np.hstack(
        (features_train_1,
         train_confidence_1.reshape(train_confidence_1.shape[0], 1)))
    #SP2 features
    features_train_2 = np.hstack(
        (features_train_2,
         train_confidence_2.reshape(train_confidence_2.shape[0], 1)))

    #Penalty parameter C of the error term for every SP system
    C1 = 0.003410871889693192
    C2 = 7.396183688299606

    #train SP classifiers
    sp1 = SVM.train(features_train_1, labels_train, c=C1, k="linear")
    sp2 = SVM.train(features_train_2, labels_train, c=C2, k="linear")

    #save trained models
    saveModels(sd1, sd2, sp1, sp2)

    print "System training completed!"
Code example #9
def main(f):
	print "System training started"
	
	#load training dataset
	dataset_train = f
	ids,labels_train,messages_train=tsvreader.opentsv(dataset_train)
	print "Train data loaded"
	
	#labels for subjectivity detection (2 categories)
	temp_labels_train = [0 if x=="neutral" else 1 for x in labels_train]
	#labels for polarity detection (3 categories)
	labels_train = [0 if x=="neutral" else -1 if x =="negative" else 1 for x in labels_train]
	
	#convert labels to numpy arrays
	temp_labels_train=np.array(temp_labels_train)
	labels_train=np.array(labels_train)
	
	#load word clusters
	clusters = loadClusters()
	print "Clusters loaded"
	
	#load Lexicons
	negationList, slangDictionary, lexicons, mpqa_lexicons = loadLexicons()
	print "Lexicons loaded"

	#tokenize all messages
	tokens_train = tokenize(messages_train)
	print "Messages tokenized"

	#compute pos tags for all messages
	pos_tags_train = arktagger.pos_tag_list(messages_train)
	print "Pos tags computed"
	
	#compute pos tag bigrams
	pos_bigrams_train = getBigrams(pos_tags_train)
	#compute pos tag trigrams
	pos_trigrams_train = getTrigrams(pos_tags_train)

	#get the unique POS tags, bigrams, and trigrams from the training set
	unique_pos_tags = getPosTagsSet(pos_tags_train)
	unique_bigrams = getBigramsSet(pos_bigrams_train)
	unique_trigrams= getTrigramsSet(pos_trigrams_train)

	#compute POS tag scores
	pos_tags_scores_neutral = posTagsScore(unique_pos_tags,0,pos_tags_train,labels_train)
	pos_tags_scores_positive = posTagsScore(unique_pos_tags,1,pos_tags_train,labels_train)
	pos_tags_scores_negative = posTagsScore(unique_pos_tags,-1,pos_tags_train,labels_train)
	   
	pos_bigrams_scores_neutral = posBigramsScore(unique_bigrams,0,pos_bigrams_train,labels_train)
	pos_bigrams_scores_positive = posBigramsScore(unique_bigrams,1,pos_bigrams_train,labels_train)
	pos_bigrams_scores_negative = posBigramsScore(unique_bigrams,-1,pos_bigrams_train,labels_train)

	pos_trigrams_scores_neutral = posTrigramsScore(unique_trigrams,0,pos_trigrams_train,labels_train)
	pos_trigrams_scores_positive = posTrigramsScore(unique_trigrams,1,pos_trigrams_train,labels_train)
	pos_trigrams_scores_negative = posTrigramsScore(unique_trigrams,-1,pos_trigrams_train,labels_train)
	
	#compute mpqa scores
	mpqaScores = getScores(mpqa_lexicons,messages_train,labels_train,neutral=True)
	
	#save scores and other resources for future use
	savePosScores(pos_tags_scores_neutral, pos_tags_scores_positive,pos_tags_scores_negative,pos_bigrams_scores_neutral,pos_bigrams_scores_positive,pos_bigrams_scores_negative,pos_trigrams_scores_neutral,pos_trigrams_scores_positive,pos_trigrams_scores_negative,mpqaScores)
	#save lexicons
	saveLexicons(negationList,slangDictionary,lexicons,mpqa_lexicons)
	#save clusters
	saveClusters(clusters)
	
	#load Glove embeddings
	d = 200
	glove = GloveDictionary.Glove(d)

	#save Glove embeddings for future use
	saveGlove(glove)
	
	#Subjectivity Detection Features
	
	#SD1 features
	features_train_1 = features.getFeatures(messages_train,tokens_train,pos_tags_train,slangDictionary,lexicons,mpqa_lexicons,pos_bigrams_train,pos_trigrams_train,pos_bigrams_scores_negative,pos_bigrams_scores_positive,pos_trigrams_scores_negative,pos_trigrams_scores_positive,pos_tags_scores_negative,pos_tags_scores_positive,mpqaScores,negationList,clusters,pos_bigrams_scores_neutral,pos_trigrams_scores_neutral,pos_tags_scores_neutral)
	
	#SD2 features
	features_train_2 = []
	#for message in tokens_train :
	for i in range(0,len(messages_train)):
		features_train_2.append(glove.findCentroid(tokens_train[i]))
	features_train_2 = np.array(features_train_2)
	
	#regularize features
	features_train_1 = regularization.regularize(features_train_1)
	features_train_2 = regularization.regularizeHorizontally(features_train_2)
	
	#Penalty parameter C of the error term for every SD system
	C1=0.001953125
	C2=1.4068830572470667

	#get confidence scores
	train_confidence_1 = getConfidenceScores(features_train_1, temp_labels_train, C1)
	train_confidence_2 = getConfidenceScores(features_train_2, temp_labels_train, C2)
	
	#normalize confidence scores to (0, 1); despite the name, this lambda is the logistic (sigmoid) function, not a softmax
	softmax = lambda x: 1 / (1. + math.exp(-x))
	train_confidence_1 = [softmax(conf) for conf in train_confidence_1]
	train_confidence_2 = [softmax(conf) for conf in train_confidence_2]
	
	train_confidence_1 = np.array(train_confidence_1)
	train_confidence_2 = np.array(train_confidence_2)

	#train SD classifiers
	sd1 = SVM.train(features_train_1,temp_labels_train,c=C1,k="linear")
	sd2 = SVM.train(features_train_2,temp_labels_train,c=C2,k="linear")
	
	#Sentiment Polarity Features (append confidence scores to SD features)
	
	#SP1 features
	features_train_1 = np.hstack((features_train_1,train_confidence_1.reshape(train_confidence_1.shape[0],1)))
	#SP2 features
	features_train_2 = np.hstack((features_train_2,train_confidence_2.reshape(train_confidence_2.shape[0],1)))

	#Penalty parameter C of the error term for every SP system
	C1=0.003410871889693192
	C2=7.396183688299606

	#train SP classifiers
	sp1 = SVM.train(features_train_1,labels_train,c=C1,k="linear")
	sp2 = SVM.train(features_train_2,labels_train,c=C2,k="linear")
	
	#save trained models
	saveModels(sd1,sd2,sp1,sp2)
	
	print "System training completed!"
Code example #10
    labels_train = [0 if x=="negative" else 1 for x in labels_train]
    labels_test = [0 if x=="negative" else 1 for x in labels_test]
    
#tokenize all messages
tokens_train = tokenize(messages_train)
tokens_test = tokenize(messages_test)

#initialize glove lexicon
glove = GloveDictionary.Glove()

#dictionary = enchant.Dict("en_US")

#slangDictionary = Slang.Slang()

pos_tags_train = arktagger.pos_tag_list(messages_train)
pos_tags_test = arktagger.pos_tag_list(messages_test)


##messages_train = preprocessMessages(messages_train,tokens_train,pos_tags_train,slangDictionary,dictionary)
##messages_test = preprocessMessages(messages_test,tokens_test,pos_tags_test,slangDictionary,dictionary)
##
##tokens_train=tokenize(messages_train)
##tokens_test=tokenize(messages_test)
##
###compute pos tags for all preprocessed messages
##pos_tags_train = arktagger.pos_tag_list(messages_train)
##pos_tags_test = arktagger.pos_tag_list(messages_test)


print("glove initialized ... " )
Code example #11
	def test(self,dom):

		test_pos = []
		temp_vector = []
		test_vector = []

		chars_to_remove = ['=', '!', '?', ',', '<', '.', '>', '/', ';', ':', ']', '}', '[', '{', '|', '@', '$', '%', '^', '&', '*', '(', ')', '_', '-', '+', '"', '1', '2', '3', '4', '5', '6', '7', '8', '9'] #characters (punctuation and digits) to strip from tokens

		reviews = ET.parse(self.test_path).getroot().findall('Review')
		for review in reviews:
			sentences = review[0] #get the sentences
			for sentence in sentences:
				if (len(sentence) > 1):
					opinions = sentence[1]
					
					if ( len(opinions) > 0): #check if there are aspects 
						t = sentence[0].text
						t2 = word_tokenize(t)
						capsList, capsCounter = self.listCaps(t2) #storing the caps words of the text
						text = word_tokenize(t.lower())
						
						for opinion in opinions:
							test_pos.append(t)
							
							#calculate score for each lexicon
							temp0 = self.checkLexiconReady(text, self.stopwords, capsList, self.negation, self.AFINN_lexicon) #Afinn lexicon scores
							temp3 = self.checkLexiconReady(text, self.stopwords, capsList, self.negation, self.huLiu_lexicon) #Hu and Liu lexicon scores
							temp4 = self.checkLexiconReady(text, self.stopwords, capsList, self.negation, self.nrc_lexicon) #NRC lexicon scores
							
							temp1 = self.checkLexiconUni(text, chars_to_remove, self.stopwords, capsList, self.negation, self.train_unigram_lexicon) #unigram lexicon scores from the train data of each domain
							
							temp7 = self.howManyUpperFirst(t2) #num of words starting with capitalized first letter
							temp9 = [self.howMany(text, '?'), self.howMany(text, '!')] #number of question and exclamation marks
							temp11 = self.lastSymbol(t2) #is the last symbol a question or an exclamation mark
													
							cat = opinion.attrib['category'].split('#') #a feature for the entity and the attribute
							cat0 = []
							for ent in self.entities:
								if ent == cat[0]:
									cat0.append(1)
								else:
									cat0.append(0)
							cat1 = []
							for attr in self.attributes:
								if attr == cat[1]:
									cat1.append(1)
								else:
									cat1.append(0)
							temp12 = [len(opinions)] + cat0 + cat1
							
							temp = temp0 + temp1 + temp3 + temp4 + temp7 + temp9 + temp11 + [capsCounter] + temp12
							temp_vector.append(temp) #creating the features vector
		  
		temp_vector = self.normalize(temp_vector) #normalize the vector              
						
		pos = arktagger.pos_tag_list(test_pos) #finding the pos tags            
		test_pos = self.howManyPos(pos)

		test_pos_bi = self.calcScorePosBi(pos)
		test_pos_bi = self.normalize(test_pos_bi)

		for i in range(len(temp_vector)): #join the matrices
			test_vector.append(temp_vector[i] + test_pos[i] + test_pos_bi[i])

		print
		print '---- End of Test ----'
		
		return test_vector