def init_dicts():
    """Build per-sentiment vocabularies from the XLSX training set.

    Returns a dict mapping each sentiment label ('negative', 'positive',
    'neutral') to a list of the unique words observed under that label.
    """
    config_path = os.path.join('DatasetBuilder', 'Configurations',
                               'Configurations.xml')
    serialization_path = os.path.join('DatasetBuilder', 'Output', 'dataset.bin')
    xlsx_path = os.path.join('DatasetBuilder', 'Input', 'sentiment')

    builder = DatasetBuilder(config_path, [], serialization_path)
    builder.trainSet = builder.GetSentimentDatasetFromXLSXFile(xlsx_path).values()

    # Accumulate words per label as sets so duplicates are dropped as we go.
    vocab = {label: set() for label in ('negative', 'positive', 'neutral')}
    for sample in builder.trainSet:
        vocab[sample['label']].update(sample['words'])

    # Callers expect plain lists (order is unspecified, as in the original).
    return {label: list(words) for label, words in vocab.items()}
# Ejemplo n.º 2 (0)  -- aggregator example separator, kept as a comment so the file parses
class SentimentModel(object):
    """Sentiment classifier (negative / positive / neutral) built on the
    project's LanguageModel / FeaturesExtractor pipeline.

    ``modeln`` selects which SVM variant ``train()`` fits (1-4).
    """

    def __init__(self, modeln=1):
        """Set up file paths and the (empty) language model.

        modeln: which model variant train() should use (1..4).
        """
        self.modeln = modeln

        # Language-model resources; all paths are relative to the CWD.
        configFileLanguageModel = os.path.join('LanguageModel',
                                               'Configurations',
                                               'Configurations_sentiment.xml')
        stopWordsFileName = os.path.join('LanguageModel', 'Input',
                                         'stop_words.txt')
        linksDBFile = os.path.join('LanguageModel', 'Output',
                                   'links_database.txt')
        languageModelSerializationFile = os.path.join('LanguageModel',
                                                      'Output',
                                                      'language_model.bin')

        self.languageModel = LanguageModel(configFileLanguageModel,
                                           stopWordsFileName,
                                           languageModelSerializationFile,
                                           linksDBFile, [])

        # Feature-extractor configuration and serialization targets.
        self.configFileFeaturesExtractor = os.path.join(
            'FeaturesExtractor', 'Configurations',
            'Configurations_sentiment.xml')
        self.trainFeaturesSerializationFile = os.path.join(
            'FeaturesExtractor', 'Output', 'train_features.bin')
        self.trainLabelsSerializationFile = os.path.join(
            'FeaturesExtractor', 'Output', 'train_labels.bin')

    def get_data(self, backend=True):
        """Load the sentiment dataset and build the language model from it.

        Merges the XLSX dataset with the backend dataset when ``backend`` is
        true (backend entries overwrite XLSX entries with the same key).

        Returns the dataset as a list, or None when fewer than MIN_DATA
        samples are available.
        """
        configFileDatasetBuilder = os.path.join('DatasetBuilder',
                                                'Configurations',
                                                'Configurations.xml')
        datasetSerializationFile = os.path.join('DatasetBuilder', 'Output',
                                                'dataset.bin')

        self.datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [],
                                             datasetSerializationFile)

        xlsxTrainFileName = os.path.join('DatasetBuilder', 'Input',
                                         'sentiment')
        dataset = self.datasetBuilder.GetSentimentDatasetFromXLSXFile(
            xlsxTrainFileName)
        if backend:
            dataset2 = self.datasetBuilder.GetSentimentDatasetFromBackend()
            for item in dataset2:
                dataset[item] = dataset2[item]
        dataset = list(dataset.values())
        if len(dataset) < MIN_DATA:
            # Not enough data to train a meaningful model yet.
            return None
        print("Data length: ", len(dataset))

        # Build the language model in place, then detach the dataset so the
        # (potentially large) list is not kept alive on the model.
        self.languageModel.dataset = dataset
        self.languageModel.totalNumberOfDocs = len(dataset)
        self.languageModel.BuildLanguageModel()
        self.languageModel.dataset = []

        return dataset

    def prepare_data(self, dataset):
        """Extract dense numeric TF features.

        Returns (X, Y): X is a list of dense rows indexed by 1-based feature
        id, Y the list of integer labels.
        """
        trainFeaturesExtractor = FeaturesExtractor(
            self.configFileFeaturesExtractor,
            self.trainFeaturesSerializationFile,
            self.trainLabelsSerializationFile,
            self.languageModel,
            dataset,
            sentiment_features=True)
        trainFeaturesExtractor.ExtractNumTfFeatures(init_dicts())

        # Width of the dense matrix = largest feature id seen anywhere.
        # default=0 keeps empty feature dicts from raising ValueError.
        maxid = max((max(f) for f in trainFeaturesExtractor.features if f),
                    default=0)

        X = []
        Y = []
        for i, feats in enumerate(trainFeaturesExtractor.features):
            row = [0] * maxid
            for fid, value in feats.items():
                row[fid - 1] = value  # feature ids are 1-based
            X.append(row)
            Y.append(trainFeaturesExtractor.labels[i])

        # Drop references so the extractor does not pin large structures.
        trainFeaturesExtractor.dataset = []
        trainFeaturesExtractor.features = []
        trainFeaturesExtractor.labels = []
        return X, Y

    def transform_data(self, X, Y):
        """Pack features and labels into one DataFrame with a 'class' column."""
        X = np.array(X)
        Y = np.array(Y)

        ri = range(X.shape[0])
        rl = range(X.shape[1])

        d = pd.DataFrame(X, index=ri, columns=rl)
        d['class'] = Y
        return d

    def split_data(self, d):
        """Stratified 75/25 split of the DataFrame's index."""
        training_indices, testing_indices = train_test_split(
            d.index,
            stratify=d['class'].values,
            train_size=0.75,
            test_size=0.25)
        return training_indices, testing_indices

    def train(self, backend=True):
        """Full pipeline: load data, extract features, fit and evaluate."""
        rawdata = self.get_data(backend)
        if rawdata is None:
            # get_data found fewer than MIN_DATA samples; nothing to train on.
            # (Previously this fell through and crashed in prepare_data.)
            return
        dxy = self.prepare_data(rawdata)
        d = self.transform_data(dxy[0], dxy[1])
        self.training_indices, self.testing_indices = self.split_data(d)

        X = d.loc[self.training_indices].drop('class', axis=1).values
        Y = d.loc[self.training_indices, 'class'].values
        Xtest = d.loc[self.testing_indices].drop('class', axis=1).values
        Ytest = d.loc[self.testing_indices, 'class'].values
        if self.modeln == 1:
            print(self.fit_model1(X, Y))
            print(self.evaluate_model1(Xtest, Ytest))
        if self.modeln == 2:
            print(self.fit_model2(X, Y))
            print(self.evaluate_model2(Xtest, Ytest))
        if self.modeln == 3:
            print(self.fit_model3(X, Y))
            print(self.evaluate_model3(Xtest, Ytest))
        if self.modeln == 4:
            print(self.fit_model4(X, Y))
            print(self.evaluate_model4(Xtest, Ytest))

    def fit_model1(self, X, Y):
        """Fit an L1-regularized linear SVM (C=0.01); return training score."""
        self.model1 = LinearSVC(C=0.01,
                                penalty="l1",
                                dual=False,
                                random_state=42)
        self.model1.fit(X, Y)
        return self.model1.score(X, Y)

    def evaluate_model1(self, X, Y):
        """Mean accuracy of the fitted model on (X, Y)."""
        return self.model1.score(X, Y)

    def fit_model2(self, X, Y):
        """Fit an L1-regularized linear SVM (C=0.18); return training score."""
        self.model1 = LinearSVC(C=0.18,
                                penalty="l1",
                                dual=False,
                                random_state=42)
        self.model1.fit(X, Y)
        return self.model1.score(X, Y)

    def evaluate_model2(self, X, Y):
        """Mean accuracy of the fitted model on (X, Y)."""
        return self.model1.score(X, Y)

    def fit_model3(self, X, Y):
        """Sweep RBF-SVC gamma values, keep the best model.

        BUGFIX: previously returned the recall of the *last* gamma tried,
        not the best one; now returns the best training score.
        """
        best_recall = 0.0
        for g in [0.01, 0.05, 0.1, 0.3, 0.5]:
            model = SVC(C=0.18, gamma=g, random_state=42)
            model.fit(X, Y)
            recall = model.score(X, Y)
            print(recall)
            if recall > best_recall:
                best_recall = recall
                self.model1 = model
        return best_recall

    def evaluate_model3(self, X, Y):
        """Mean accuracy of the fitted model on (X, Y)."""
        return self.model1.score(X, Y)

    def fit_model4(self, X, Y):
        """Fit a fixed RBF SVC (C=0.18, gamma=0.1); return training score."""
        model = SVC(C=0.18, gamma=0.1, random_state=42)
        model.fit(X, Y)
        recall = model.score(X, Y)
        self.model1 = model
        return recall

    def evaluate_model4(self, X, Y):
        """Mean accuracy of the fitted model on (X, Y)."""
        return self.model1.score(X, Y)

    def classify(self, tweets):
        """Predict a sentiment label for each raw tweet text."""
        dataset = [{'text': tw, 'label': 'neutral'} for tw in tweets]
        X, Y = self.prepare_data(dataset)
        return self.model1.predict(X)

    @classmethod
    def load(cls, path):
        """Unpickle a saved model from ``path``.

        NOTE: pickle is unsafe on untrusted input — only load trusted files.
        """
        # 'with' closes the handle even if unpickling raises (was leaked).
        with open(path, 'rb') as f:
            return pickle.load(f)

    def save(self, path):
        """Pickle this model to ``path``."""
        with open(path, 'wb') as f:
            return pickle.dump(self, f)
class SentimentModel(object):
    """Sentiment classifier using sparse TF features.

    Duplicate definition that shadows the earlier SentimentModel; this
    variant extracts sparse features, adds a LogisticRegression option
    (modeln=5), and returns an accuracy/sample-count summary from train().
    """

    def __init__(self, modeln=1):
        """Set up file paths and the (empty) language model.

        modeln: which model variant train() should use (1..5).
        """
        self.modeln = modeln

        # Language-model resources; all paths are relative to the CWD.
        configFileLanguageModel = os.path.join(
            'LanguageModel', 'Configurations', 'Configurations_sentiment.xml')
        stopWordsFileName = os.path.join(
            'LanguageModel', 'Input', 'stop_words.txt')
        linksDBFile = os.path.join(
            'LanguageModel', 'Output', 'links_database.txt')
        languageModelSerializationFile = os.path.join(
            'LanguageModel', 'Output', 'language_model.bin')

        self.languageModel = LanguageModel(configFileLanguageModel,
                                           stopWordsFileName,
                                           languageModelSerializationFile,
                                           linksDBFile, [])

        # Feature-extractor configuration and serialization targets.
        self.configFileFeaturesExtractor = os.path.join(
            'FeaturesExtractor', 'Configurations',
            'Configurations_sentiment.xml')
        self.trainFeaturesSerializationFile = os.path.join(
            'FeaturesExtractor', 'Output', 'train_features.bin')
        self.trainLabelsSerializationFile = os.path.join(
            'FeaturesExtractor', 'Output', 'train_labels.bin')

    def get_data(self, backend=True):
        """Load the sentiment dataset and build the language model from it.

        Merges the XLSX dataset with the backend dataset when ``backend`` is
        true (backend entries overwrite XLSX entries with the same key).

        Returns the dataset as a list, or None when fewer than MIN_DATA
        samples are available.
        """
        configFileDatasetBuilder = os.path.join(
            'DatasetBuilder', 'Configurations', 'Configurations.xml')
        datasetSerializationFile = os.path.join(
            'DatasetBuilder', 'Output', 'dataset.bin')

        self.datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [],
                                             datasetSerializationFile)

        xlsxTrainFileName = os.path.join('DatasetBuilder', 'Input',
                                         'sentiment')
        dataset = self.datasetBuilder.GetSentimentDatasetFromXLSXFile(
            xlsxTrainFileName)
        if backend:
            dataset2 = self.datasetBuilder.GetSentimentDatasetFromBackend()
            for item in dataset2:
                dataset[item] = dataset2[item]
        dataset = list(dataset.values())
        if len(dataset) < MIN_DATA:
            # Not enough data to train a meaningful model yet.
            return None
        print("Data length: ", len(dataset))

        # Build the language model in place, then detach the dataset so the
        # (potentially large) list is not kept alive on the model.
        self.languageModel.dataset = dataset
        self.languageModel.totalNumberOfDocs = len(dataset)
        self.languageModel.BuildLanguageModel()
        self.languageModel.dataset = []

        return dataset

    def prepare_data(self, dataset):
        """Extract sparse numeric TF features.

        Returns (X, Y): X is the extractor's sparse feature matrix, Y a
        numpy array of integer labels.
        """
        trainFeaturesExtractor = FeaturesExtractor(
            self.configFileFeaturesExtractor,
            self.trainFeaturesSerializationFile,
            self.trainLabelsSerializationFile,
            self.languageModel,
            dataset,
            sentiment_features=True)
        trainFeaturesExtractor.ExtractNumTfFeatures(
            sentiment_dict=init_dicts(), sparse=True)

        X = trainFeaturesExtractor.sparse_features
        Y = np.array(trainFeaturesExtractor.labels)

        # Drop references so the extractor does not pin large structures.
        trainFeaturesExtractor.dataset = []
        trainFeaturesExtractor.features = []
        trainFeaturesExtractor.labels = []
        return X, Y

    def transform_data(self, X, Y):
        """Pack features and labels into one DataFrame with a 'class' column."""
        X = np.array(X)
        Y = np.array(Y)

        ri = range(X.shape[0])
        rl = range(X.shape[1])

        d = pd.DataFrame(X, index=ri, columns=rl)
        d['class'] = Y
        return d

    def split_data(self, X, Y):
        """Stratified 75/25 split over row positions of X.

        Also records the training-set size for train()'s summary.
        """
        training_indices, testing_indices = train_test_split(
            range(X.shape[0]), stratify=Y, train_size=0.75, test_size=0.25)
        self.ntraining_samples = len(training_indices)
        return training_indices, testing_indices

    def train(self, backend=True):
        """Full pipeline: load data, extract features, fit and evaluate.

        Returns {'accuracy': ..., 'training_samples': ...}, or None when
        get_data found fewer than MIN_DATA samples.
        """
        rawdata = self.get_data(backend)
        if rawdata is None:
            # Previously this fell through and crashed in prepare_data.
            return None
        Xall, Yall = self.prepare_data(rawdata)
        self.training_indices, self.testing_indices = self.split_data(Xall,
                                                                      Yall)

        X = Xall[self.training_indices]
        Y = Yall[self.training_indices]
        Xtest = Xall[self.testing_indices]
        Ytest = Yall[self.testing_indices]
        acc = 0.0
        if self.modeln == 1:
            print(self.fit_model1(X, Y))
            acc = self.evaluate_model1(Xtest, Ytest)
        if self.modeln == 2:
            print(self.fit_model2(X, Y))
            acc = self.evaluate_model2(Xtest, Ytest)
        if self.modeln == 3:
            print(self.fit_model3(X, Y))
            acc = self.evaluate_model3(Xtest, Ytest)
        if self.modeln == 4:
            print(self.fit_model4(X, Y))
            acc = self.evaluate_model4(Xtest, Ytest)
        if self.modeln == 5:
            print(self.fit_model5(X, Y))
            acc = self.evaluate_model5(Xtest, Ytest)
        return {'accuracy': acc, 'training_samples': self.ntraining_samples}

    def fit_model1(self, X, Y):
        """Fit a linear SVM (C=0.018); return training score."""
        self.model1 = LinearSVC(C=0.018, dual=False, random_state=42)
        self.model1.fit(X, Y)
        return self.model1.score(X, Y)

    def evaluate_model1(self, X, Y):
        """Mean accuracy of the fitted model on (X, Y)."""
        return self.model1.score(X, Y)

    def fit_model2(self, X, Y):
        """Fit an L1-regularized linear SVM (C=0.18); return training score."""
        self.model1 = LinearSVC(C=0.18, penalty="l1", dual=False,
                                random_state=42)
        self.model1.fit(X, Y)
        return self.model1.score(X, Y)

    def evaluate_model2(self, X, Y):
        """Mean accuracy of the fitted model on (X, Y)."""
        return self.model1.score(X, Y)

    def fit_model3(self, X, Y):
        """Sweep RBF-SVC gamma values, keep the best model.

        BUGFIX: previously returned the recall of the *last* gamma tried,
        not the best one; now returns the best training score.
        """
        best_recall = 0.0
        for g in [0.01, 0.05, 0.1, 0.3, 0.5]:
            model = SVC(C=0.18, gamma=g, random_state=42)
            model.fit(X, Y)
            recall = model.score(X, Y)
            print(recall)
            if recall > best_recall:
                best_recall = recall
                self.model1 = model
        return best_recall

    def evaluate_model3(self, X, Y):
        """Mean accuracy of the fitted model on (X, Y)."""
        return self.model1.score(X, Y)

    def fit_model4(self, X, Y):
        """Fit a fixed RBF SVC (C=0.18, gamma=0.1); return training score."""
        model = SVC(C=0.18, gamma=0.1, random_state=42)
        model.fit(X, Y)
        recall = model.score(X, Y)
        self.model1 = model
        return recall

    def evaluate_model4(self, X, Y):
        """Mean accuracy of the fitted model on (X, Y)."""
        return self.model1.score(X, Y)

    def fit_model5(self, X, Y):
        """Fit a logistic regression (C=0.18); return training score."""
        model = LogisticRegression(C=0.18, random_state=42)
        model.fit(X, Y)
        recall = model.score(X, Y)
        self.model1 = model
        return recall

    def evaluate_model5(self, X, Y):
        """Mean accuracy of the fitted model on (X, Y)."""
        return self.model1.score(X, Y)

    def classify(self, tweets):
        """Predict a sentiment label for each raw tweet text."""
        dataset = [{'text': tw, 'label': 'neutral'} for tw in tweets]
        X, Y = self.prepare_data(dataset)
        return self.model1.predict(X)

    @classmethod
    def load(cls, path):
        """Unpickle a saved model from ``path``.

        NOTE: pickle is unsafe on untrusted input — only load trusted files.
        """
        # 'with' closes the handle even if unpickling raises (was leaked).
        with open(path, 'rb') as f:
            return pickle.load(f)

    def save(self, path):
        """Pickle this model to ``path``."""
        with open(path, 'wb') as f:
            return pickle.dump(self, f)