def Classify(self, text, stockName):
    """Classify every item of ``text`` with the model chosen by ``stockName``.

    The Tasi language model and classifier are used when ``stockName`` is
    exactly ``'Tasi'``; any other value selects the lexicon pair.

    :param text: iterable of texts; each item becomes one unlabeled sample.
    :param stockName: stock identifier, used only to pick the model pair.
    :return: the first element of the 3-tuple returned by the selected
        classifier's ``Test()``.
    """
    # Samples carry an empty label: only the text is known here.
    testSet = [{'label': '', 'text': t} for t in text]

    if stockName == 'Tasi':
        # Configurations file xml of the features extractor
        configFileFeaturesExtractor = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations-Tasi.xml')
        testFeaturesExtractor = FeaturesExtractor(configFileFeaturesExtractor, None, None, self.languageModel_Tasi, testSet)
        testFeaturesExtractor.ExtractNumTfFeatures()
        classifier = self.classifier_Tasi
    else:
        configFileFeaturesExtractor = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations-lexicon.xml')
        testFeaturesExtractor = FeaturesExtractor(configFileFeaturesExtractor, None, None, self.languageModel_lexicon, testSet)
        # NOTE(review): unlike the Tasi branch, no Extract*Features() call is
        # made here before ``features`` is read - confirm this is intended.
        classifier = self.classifier_Lexicon

    classifier.testFeatures = testFeaturesExtractor.features
    # One placeholder target (1) per feature row. The list is built in one
    # step; assigning by index into a freshly emptied list raised IndexError.
    classifier.testTargets = [1] * len(classifier.testFeatures)
    label, _acc, _val = classifier.Test()

    return label
예제 #2
0
    def prepare_data(self, dataset):
        """Extract sentiment features for ``dataset`` and return dense rows.

        :param dataset: samples, passed to ``FeaturesExtractor`` unchanged.
        :return: ``(X, Y)``. ``X`` is a list with one row per extracted
            sample; every row is a list whose length is the largest feature
            id seen, holding each feature value at index ``id - 1`` and 0
            elsewhere. ``Y`` is the list of the matching labels.
        """
        trainFeaturesExtractor = FeaturesExtractor(
            self.configFileFeaturesExtractor,
            self.trainFeaturesSerializationFile,
            self.trainLabelsSerializationFile,
            self.languageModel,
            dataset,
            sentiment_features=True)
        trainFeaturesExtractor.ExtractNumTfFeatures(init_dicts())

        features = trainFeaturesExtractor.features
        labels = trainFeaturesExtractor.labels

        # Row width is the largest feature id. Samples without any feature
        # are skipped while computing it, so one empty sample (or an empty
        # dataset) no longer makes max() raise ValueError.
        largestIds = [max(sample.keys()) for sample in features if sample]
        maxid = max(largestIds) if largestIds else 0

        X = []
        Y = []
        for i, sample in enumerate(features):
            row = [0] * maxid
            # Feature ids are written at ``id - 1`` (ids look 1-based -
            # TODO confirm against FeaturesExtractor).
            for featureId, value in sample.items():
                row[featureId - 1] = value
            X.append(row)
            Y.append(labels[i])

        # Detach the extractor from its data; presumably so the large
        # intermediate structures can be reclaimed - confirm.
        trainFeaturesExtractor.dataset = []
        trainFeaturesExtractor.features = []
        trainFeaturesExtractor.labels = []
        return X, Y
예제 #3
0
    def RunFeatureExtractor(self, _Classifier, trainSet):
        """Create a ``FeaturesExtractor`` for ``trainSet`` and run extraction.

        :param _Classifier: classifier name. It selects the
            ``Configurations-<name>.xml`` file and, when equal to
            ``"Lexicon"``, lexicon feature extraction; any other value runs
            ``ExtractNumTfFeatures``.
        :param trainSet: training samples handed to the extractor.
        :return: the extractor, after its extraction method has been called.
        """
        # Everything lives below <basePath>/FeaturesExtractor.
        extractorRoot = os.path.join(self.basePath, "FeaturesExtractor")
        outputDir = os.path.join(extractorRoot, "Output")

        configPath = os.path.join(extractorRoot, "Configurations",
                                  "Configurations-" + _Classifier + ".xml")
        # Serialization targets for the features and the labels.
        featuresPath = os.path.join(outputDir, "train_features.bin")
        labelsPath = os.path.join(outputDir, "train_labels.bin")

        extractor = FeaturesExtractor(configPath, featuresPath, labelsPath,
                                      self.languageModel, trainSet)

        # Pick the extraction routine, then run it.
        runExtraction = (extractor.ExtractLexiconFeatures
                         if _Classifier == "Lexicon"
                         else extractor.ExtractNumTfFeatures)
        runExtraction()
        return extractor
    def prepare_data(self, dataset):
        """Extract question features for ``dataset`` and return dense rows.

        :param dataset: samples, passed to ``FeaturesExtractor`` unchanged.
        :return: ``(X, Y, L)``. ``X`` is a list with one row per extracted
            sample; every row is a list whose length is the largest feature
            id seen, holding each feature value at index ``id - 1`` and 0
            elsewhere. ``Y`` is the list of the matching labels and ``L`` is
            ``len(dataset)``.
        """
        trainFeaturesExtractor = FeaturesExtractor(
            self.configFileFeaturesExtractor,
            self.trainFeaturesSerializationFile,
            self.trainLabelsSerializationFile,
            self.languageModel,
            dataset,
            questions_features=True)

        L = len(dataset)
        print("Data length: ", L)

        words_dict = self.datasetBuilder.getQuestionsDatasetDictionary(
            self.words_dict_path)
        trainFeaturesExtractor.ExtractNumTfFeatures(questions_dict=words_dict)

        features = trainFeaturesExtractor.features
        labels = trainFeaturesExtractor.labels

        # Row width is the largest feature id. Samples without any feature
        # are skipped while computing it, so one empty sample (or an empty
        # dataset) no longer makes max() raise ValueError.
        largestIds = [max(sample.keys()) for sample in features if sample]
        maxid = max(largestIds) if largestIds else 0

        X = []
        Y = []
        for i, sample in enumerate(features):
            row = [0] * maxid
            # Feature ids are written at ``id - 1`` (ids look 1-based -
            # TODO confirm against FeaturesExtractor).
            for featureId, value in sample.items():
                row[featureId - 1] = value
            X.append(row)
            Y.append(labels[i])

        return X, Y, L
    def evaluate(cls, path, use_backend=True, pre_stocks=None):
        """Measure the accuracy of every stored per-stock model.

        For each stock the model is loaded with ``cls.load``, the first
        ``NVALID`` samples of the stock's dataset are turned into features
        and the model's classifier is run on them with ``Test()``.

        :param path: location passed to ``cls.load``.
        :param use_backend: when true the dataset comes from
            ``GetDatasetFromBackend``; otherwise from the XLSX train file.
        :param pre_stocks: optional replacement for the module-level
            ``stocks`` list. When truthy it REBINDS that global.
        :return: dict mapping stock name to
            ``{'accuracy': ..., 'training_samples': ...}``. Stocks without a
            stored model or with fewer than ``NMIN_SET`` samples are absent.
        """
        global stocks
        if pre_stocks:
            stocks = pre_stocks

        # File locations do not depend on the stock, so build them once.
        builderConfig = os.path.join('DatasetBuilder', 'Configurations',
                                     'Configurations.xml')
        # The serialization file to save the dataset
        datasetDump = os.path.join('DatasetBuilder', 'Output', 'dataset.bin')
        # The XLSX file name for train set
        trainXlsx = os.path.join('DatasetBuilder', 'Input', 'train')

        validation_accuracy = {}
        for stockName in stocks:
            model = cls.load(path, stockName)
            if not model:
                continue

            # Initialize the DatasetBuilder from serialization file
            builder = DatasetBuilder(builderConfig, [], datasetDump)
            testSet = (builder.GetDatasetFromBackend(stockName)
                       if use_backend
                       else builder.GetDatasetFromXLSXFile(trainXlsx,
                                                           stockName))

            if len(testSet) < NMIN_SET:
                continue

            # Only the leading NVALID samples are evaluated.
            testSet = testSet[:NVALID]

            print('Using model for %s' % stockName)
            extractorConfig = os.path.join('FeaturesExtractor',
                                           'Configurations',
                                           'Configurations-Tasi.xml')
            extractor = FeaturesExtractor(extractorConfig, None, None,
                                          model['languageModel_lexicon'],
                                          testSet)
            extractor.ExtractNumTfFeatures(sparse=True)

            classifier = model['classifier_Lexicon']
            classifier.testFeatures = extractor.sparse_features
            classifier.testTargets = extractor.labels
            label, acc, val = classifier.Test()
            print(acc, val)

            validation_accuracy[stockName] = {
                'accuracy': acc,
                'training_samples': model['training_samples'],
            }
        return validation_accuracy
	def prepare_data(self, dataset):
		"""Extract sparse sentiment features for ``dataset``.

		:param dataset: samples, passed to ``FeaturesExtractor`` unchanged.
		:return: ``(X, Y)`` where ``X`` is the extractor's ``sparse_features``
			object and ``Y`` is a NumPy array built from its labels.
		"""
		extractor = FeaturesExtractor(
			self.configFileFeaturesExtractor,
			self.trainFeaturesSerializationFile,
			self.trainLabelsSerializationFile,
			self.languageModel,
			dataset,
			sentiment_features=True)
		extractor.ExtractNumTfFeatures(sentiment_dict=init_dicts(), sparse=True)

		# Capture the results before the extractor's fields are reset below.
		result = (extractor.sparse_features, np.array(extractor.labels))

		# Detach the extractor from its data; presumably so the intermediate
		# structures can be reclaimed - confirm.
		extractor.dataset = []
		extractor.features = []
		extractor.labels = []
		return result
    def classify(cls, text, stockName, path):
        """Label every item of ``text`` with the stored model of a stock.

        :param text: iterable of texts; each item becomes one unlabeled
            sample.
        :param stockName: stock whose model is loaded with ``cls.load``.
        :param path: location passed to ``cls.load``.
        :return: list of integer labels, built per probability entry as
            ``0`` when the two leading probabilities differ by less than 0.2
            or the largest probability is below 0.4, ``1`` when the first
            probability is larger, ``2`` when the first is less than or
            equal to the second. When no model is found an ``Exception``
            instance is RETURNED (not raised).
        """
        model = cls.load(path, stockName)
        if not model:
            # NOTE(review): the exception object is returned rather than
            # raised; callers must check the result type. Kept as is because
            # the callers are not visible from here.
            return Exception('Stock wasn\'t found')

        testSet = [{'label': '', 'text': t} for t in text]

        print('Using model for %s' % stockName)
        extractor = FeaturesExtractor(
            os.path.join('FeaturesExtractor', 'Configurations',
                         'Configurations-Tasi.xml'),
            None, None, model['languageModel_lexicon'], testSet)
        extractor.ExtractNumTfFeatures(sparse=True)

        classifier = model['classifier_Lexicon']
        classifier.testFeatures = extractor.sparse_features
        # One placeholder target (1) per feature row.
        classifier.testTargets = [
            1 for _ in range(classifier.testFeatures.shape[0])
        ]

        label = []
        for entry in classifier.predict_propa():
            print(entry)
            # Each entry is indexed once more to reach the probabilities
            # (presumably a one-row wrapper - confirm against the classifier).
            prob = entry[0]
            if abs(prob[0] - prob[1]) < 0.2 or max(prob) < 0.4:
                label.append(0)
            elif prob[0] > prob[1]:
                label.append(1)
            elif prob[0] <= prob[1]:
                label.append(2)
        return label
예제 #8
0
    def Classify(self, text):
        """Classify every item of ``text`` with the configured classifier.

        :param text: iterable of texts; each item becomes one unlabeled
            sample.
        :return: the first element of the 3-tuple returned by
            ``self.classifier.Test()``.
        """
        testSet = [{'label': '', 'text': t} for t in text]

        # Configurations file xml of the features extractor
        configFileName = "Configurations-" + self.usedClassifier + ".xml"
        extractor = FeaturesExtractor(
            os.path.join(self.basePath, "FeaturesExtractor", "Configurations",
                         configFileName),
            None, None, self.languageModel, testSet)

        # Lexicon classifiers use lexicon features; all others use NumTf.
        if self.usedClassifier != "Lexicon":
            extractor.ExtractNumTfFeatures()
        else:
            extractor.ExtractLexiconFeatures()

        classifier = self.classifier
        classifier.testFeatures = extractor.features
        # One placeholder target (1) per feature row.
        classifier.testTargets = [
            1 for _ in range(len(classifier.testFeatures))
        ]
        label, acc, val = classifier.Test()

        return label
예제 #9
0
# Script fragment: extracts NumTf features from the training set.
# NOTE(review): configFileFeaturesExtractor, trainFeaturesSerializationFile,
# languageModel and datasetBuilder are used below but are not assigned in this
# fragment; they have to be defined earlier in the script.
#
# Serialization and export targets.
# NOTE(review): these literals hard-code the Windows path separator; consider
# os.path.join if the script has to run on other platforms.
# In the visible lines the test* and *ExportFileName values are referenced
# only by the commented-out calls further down.
trainLabelsSerializationFile = ".\\FeaturesExtractor\\Output\\train_labels.bin"
testFeaturesSerializationFile = ".\\FeaturesExtractor\\Output\\test_features.bin"
testLabelsSerializationFile = ".\\FeaturesExtractor\\Output\\test_labels.bin"
testExportFileName = ".\\FeaturesExtractor\\Output\\test_data.txt"
trainExportFileName = ".\\FeaturesExtractor\\Output\\train_data.txt"

# Start the FeaturesExtractor:
#-----------------------------
# Initialize the FeaturesExtractor on the training set
trainFeaturesExtractor = FeaturesExtractor(configFileFeaturesExtractor,
                                           trainFeaturesSerializationFile,
                                           trainLabelsSerializationFile,
                                           languageModel,
                                           datasetBuilder.trainSet)
# ExtractNumTfFeatures is the only extraction currently enabled; the
# commented-out calls are the disabled alternatives and save/dump steps.
#trainFeaturesExtractor.ExtractTFFeatures()
trainFeaturesExtractor.ExtractNumTfFeatures()
#trainFeaturesExtractor.ExtractTFIDFFeatures()
#trainFeaturesExtractor.ExtractKLFeatures()
#trainFeaturesExtractor.SaveFeatures()
#trainFeaturesExtractor.SaveLabels()
#trainFeaturesExtractor.DumpFeaturesToTxt(trainExportFileName)

# Test-set extraction is entirely disabled.
#testFeaturesExtractor = FeaturesExtractor(configFileFeaturesExtractor, testFeaturesSerializationFile, testLabelsSerializationFile, languageModel, datasetBuilder.testSet)
#testFeaturesExtractor.ExtractTFFeatures()
#testFeaturesExtractor.ExtractTFIDFFeatures()
#testFeaturesExtractor.ExtractNumTfFeatures()
#testFeaturesExtractor.ExtractKLFeatures()
#testFeaturesExtractor.SaveFeatures()
#testFeaturesExtractor.SaveLabels()
#testFeaturesExtractor.DumpFeaturesToTxt(testExportFileName)
    def __init__(self):
        '''
        Train the lexicon and the Tasi models from the XLSX train file.

        Loads the training set, builds one language model per configuration,
        extracts NumTf features with each of them and trains one classifier
        per configuration. The results are stored on the instance as
        languageModel_lexicon, languageModel_Tasi, classifier_Lexicon and
        classifier_Tasi.

        Both language models, both feature extractors and both classifiers
        are handed the same serialization file paths.
        '''
        # File locations
        #---------------
        # DatasetBuilder: configuration, serialization file, XLSX train set
        builderConfig = os.path.join('DatasetBuilder', 'Configurations', 'Configurations.xml')
        datasetDump = os.path.join('DatasetBuilder', 'Output', 'dataset.bin')
        trainXlsx = os.path.join('DatasetBuilder', 'Input', 'train')

        # LanguageModel: one configuration per model, shared inputs/outputs
        lexiconModelConfig = os.path.join('LanguageModel', 'Configurations', 'Configurations-lexicon.xml')
        tasiModelConfig = os.path.join('LanguageModel', 'Configurations', 'Configurations-Tasi.xml')
        stopWords = os.path.join('LanguageModel', 'Input', 'stop_words.txt')
        linksDb = os.path.join('LanguageModel', 'Output', 'links_database.txt')
        modelDump = os.path.join('LanguageModel', 'Output', 'language_model.bin')

        # FeaturesExtractor: one configuration per model, shared outputs
        lexiconExtractorConfig = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations-lexicon.xml')
        tasiExtractorConfig = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations-Tasi.xml')
        featuresDump = os.path.join('FeaturesExtractor', 'Output', 'train_features.bin')
        labelsDump = os.path.join('FeaturesExtractor', 'Output', 'train_labels.bin')

        # Classifier: one configuration per model, shared output
        lexiconClassifierConfig = os.path.join('Classifier', 'Configurations', 'Configurations-lexicon.xml')
        tasiClassifierConfig = os.path.join('Classifier', 'Configurations', 'Configurations-Tasi.xml')
        classifierDump = os.path.join('Classifier', 'Output', 'classifier_model.bin')

        # Dataset
        #--------
        builder = DatasetBuilder(builderConfig, [], datasetDump)
        builder.trainSet = builder.GetDatasetFromXLSXFile(trainXlsx)

        # Language models (lexicon first, then Tasi)
        #-------------------------------------------
        self.languageModel_lexicon = LanguageModel(
            lexiconModelConfig, stopWords, modelDump, linksDb,
            builder.trainSet)
        self.languageModel_lexicon.BuildLanguageModel()

        self.languageModel_Tasi = LanguageModel(
            tasiModelConfig, stopWords, modelDump, linksDb, builder.trainSet)
        self.languageModel_Tasi.BuildLanguageModel()

        # Feature extraction (lexicon first, then Tasi)
        #----------------------------------------------
        lexiconExtractor = FeaturesExtractor(
            lexiconExtractorConfig, featuresDump, labelsDump,
            self.languageModel_lexicon, builder.trainSet)
        lexiconExtractor.ExtractNumTfFeatures()

        tasiExtractor = FeaturesExtractor(
            tasiExtractorConfig, featuresDump, labelsDump,
            self.languageModel_Tasi, builder.trainSet)
        tasiExtractor.ExtractNumTfFeatures()

        # Debug output: first four extracted labels next to the first four
        # labels of the extractor's data set.
        print(tasiExtractor.labels[:4])
        print([sample['label'] for sample in lexiconExtractor.dataSet[:4]])

        # Classifiers
        #------------
        self.classifier_Lexicon = Classifier(
            lexiconClassifierConfig, classifierDump,
            lexiconExtractor.features, lexiconExtractor.labels, [], [])
        self.classifier_Tasi = Classifier(
            tasiClassifierConfig, classifierDump,
            tasiExtractor.features, tasiExtractor.labels, [], [])

        # Train
        self.classifier_Lexicon.Train()
        self.classifier_Tasi.Train()
    def init(cls, save_path, use_backend=True, pre_stocks=None):
        '''
        Build, train and save one model per stock.

        For every stock the dataset is loaded, the first NVALID samples are
        dropped, a language model is built, sparse NumTf features are
        extracted and a classifier is trained. The resulting dict (keys
        'languageModel_lexicon', 'classifier_Lexicon', 'training_samples')
        is handed to cls.save. Stocks with fewer than NMIN_SET samples are
        reported and skipped.

        :param save_path: location passed to cls.save.
        :param use_backend: when true the dataset comes from
            GetDatasetFromBackend; otherwise from the XLSX train file.
        :param pre_stocks: optional replacement for the module-level
            ``stocks`` list. When truthy it REBINDS that global.
        '''
        global stocks
        if pre_stocks:
            stocks = pre_stocks

        # File locations do not depend on the stock, so build them once.
        # DatasetBuilder: configuration, serialization file, XLSX train set
        builderConfig = os.path.join('DatasetBuilder', 'Configurations',
                                     'Configurations.xml')
        datasetDump = os.path.join('DatasetBuilder', 'Output', 'dataset.bin')
        trainXlsx = os.path.join('DatasetBuilder', 'Input', 'train')
        # LanguageModel: configuration, inputs and serialization file
        modelConfig = os.path.join('LanguageModel', 'Configurations',
                                   'Configurations-Tasi.xml')
        stopWords = os.path.join('LanguageModel', 'Input', 'stop_words.txt')
        linksDb = os.path.join('LanguageModel', 'Output',
                               'links_database.txt')
        modelDump = os.path.join('LanguageModel', 'Output',
                                 'language_model.bin')
        # FeaturesExtractor: configuration and serialization files
        extractorConfig = os.path.join('FeaturesExtractor', 'Configurations',
                                       'Configurations-Tasi.xml')
        featuresDump = os.path.join('FeaturesExtractor', 'Output',
                                    'train_features.bin')
        labelsDump = os.path.join('FeaturesExtractor', 'Output',
                                  'train_labels.bin')
        # Classifier: configuration and serialization file
        classifierConfig = os.path.join('Classifier', 'Configurations',
                                        'Configurations-Tasi.xml')
        classifierDump = os.path.join('Classifier', 'Output',
                                      'classifier_model.bin')

        for stock in stocks:
            print('Buildind model for %s' % stock)

            # Dataset
            builder = DatasetBuilder(builderConfig, [], datasetDump)
            builder.trainSet = (builder.GetDatasetFromBackend(stock)
                                if use_backend
                                else builder.GetDatasetFromXLSXFile(trainXlsx,
                                                                    stock))
            if len(builder.trainSet) < NMIN_SET:
                print("Not enough data: ", len(builder.trainSet))
                continue
            # The leading NVALID samples are left out of training.
            builder.trainSet = builder.trainSet[NVALID:]

            # Language model
            languageModel = LanguageModel(modelConfig, stopWords, modelDump,
                                          linksDb, builder.trainSet)
            languageModel.BuildLanguageModel()

            # Features
            extractor = FeaturesExtractor(extractorConfig, featuresDump,
                                          labelsDump, languageModel,
                                          builder.trainSet)
            extractor.ExtractNumTfFeatures(sparse=True)

            # Classifier
            classifier = Classifier(classifierConfig, classifierDump,
                                    extractor.sparse_features,
                                    extractor.labels, [], [])
            classifier.Train()

            stock_model = {
                'languageModel_lexicon': languageModel,
                'classifier_Lexicon': classifier,
                'training_samples': len(builder.trainSet),
            }
            cls.save(save_path, stock, stock_model)

            print("----------------------------------------------------")