def __init__(self, basePath, stockName, Retrain):
        '''
        Constructor
        :type self:
        '''

        if (basePath == None):
            self.basePath = self.basePath
        else:
            self.basePath = basePath

        self.stockName = stockName
        serializationFile = open(
            os.path.join(self.basePath, 'StockToClassifier.bin'), 'rb')
        self.StockToClassifier = pickle.load(serializationFile)
        #import pdb; pdb.set_trace()
        self.usedClassifier = self.StockToClassifier[self.stockName]
        # Start the DatasetBuilder
        #-------------------------
        # Configurations file xml of the dataset builder
        configFileDatasetBuilder = os.path.join(self.basePath,
                                                "DatasetBuilder",
                                                "Configurations",
                                                "Configurations.xml")

        # The serialization file to save the dataset
        datasetSerializationFile = os.path.join(self.basePath,
                                                "DatasetBuilder", "Output",
                                                "dataset.bin")

        if Retrain == False:
            # The XLSX file name for train set
            xlsxTrainFileName = os.path.join(self.basePath, "DatasetBuilder",
                                             "Input", "train")

            # Initialize the DatasetBuilder from serialization file
            datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [],
                                            datasetSerializationFile)

            datasetBuilder.trainSet = datasetBuilder.GetDatasetFromXLSXFile(
                xlsxTrainFileName)

            self.RunLanguageModel(self.usedClassifier, datasetBuilder.trainSet)

            trainFeaturesExtractor = self.RunFeatureExtractor(
                self.usedClassifier, datasetBuilder.trainSet)
            self.Train(self.usedClassifier, trainFeaturesExtractor, True)
        else:
            # Initialize the DatasetBuilder from serialization file
            datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [],
                                            datasetSerializationFile)
def init_dicts():

	configFileDatasetBuilder = os.path.join('DatasetBuilder','Configurations','Configurations.xml')
					   
	datasetSerializationFile = os.path.join('DatasetBuilder','Output', 'dataset.bin')

	xlsxTrainFileName = os.path.join('DatasetBuilder','Input','sentiment')


	datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)
	datasetBuilder.trainSet = datasetBuilder.GetSentimentDatasetFromXLSXFile(xlsxTrainFileName).values()

	words_dict = {'negative':[], 'positive': [], 'neutral': []}
	for item in datasetBuilder.trainSet:
		words_dict[item['label']] += item['words']
	for k in words_dict:
		words_dict[k] = list(set(words_dict[k]))
	
	return words_dict
Example #3
0
                                datasetSerializationFile)

# Load the dataset
#datasetBuilder.LoadDataset()

# Update the labels
'''
numFiles = 50
for i in range(numFiles):
	print('Updating labels from file ' + xlsxManualLabelsFileName  + "_" + str(i + 1) + '...')
	datasetBuilder.UpdateManualLabelsFromXLSXFile(xlsxManualLabelsFileName  + "_" + str(i + 1), (i + 1)) # This should be done separately when dataset is manually labeled

# Form or load the train/test sets
datasetBuilder.SplitTrainTest()
'''
datasetBuilder.trainSet = datasetBuilder.GetDatasetFromXLSXFile(
    xlsxTrainFileName)
# Set the dataset to the train set so that the language model is built from train tweets only
datasetBuilder.dataSet = datasetBuilder.GetDatasetFromXLSXFile(
    xlsxTrainFileName)
datasetBuilder.testSet = datasetBuilder.GetDatasetFromXLSXFile(
    xlsxTestFileName)
#datasetBuilder.dataSet.extend(datasetBuilder.GetDatasetFromXLSXFile(xlsxTestFileName))
#datasetBuilder.trainSet.extend(datasetBuilder.GetDatasetFromXLSXFile(xlsxTestFileName))

# Configurations file xml of the language model
configFileLanguageModel = ".\\LanguageModel\\Configurations\\Configurations.xml"
langModelLogFile = ".\\LanguageModel\\Output\\language_model.txt"
langModelTxtLoadFile = ".\\LanguageModel\\Output\\language_model_stocks_mix.txt"
stopWordsFileName = ".\\LanguageModel\\Input\\stop_words.txt"
linksDBFile = ".\\LanguageModel\\Output\\links_database.txt"
# The serialization file to save the model
    def __init__(self):
        '''
        Constructor
        :type self:
        '''
        # Start the DatasetBuilder
        #-------------------------
        # Configurations file xml of the dataset builder
        configFileDatasetBuilder = os.path.join('DatasetBuilder','Configurations','Configurations.xml')
               
        # The serialization file to save the dataset
        datasetSerializationFile = os.path.join('DatasetBuilder','Output', 'dataset.bin')
               
        # The XLSX file name for train set
        xlsxTrainFileName = os.path.join('DatasetBuilder','Input','train')
        
        
        # Initialize the DatasetBuilder from serialization file
        datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)
        
        datasetBuilder.trainSet = datasetBuilder.GetDatasetFromXLSXFile(xlsxTrainFileName)
                
        
        # Configurations file xml of the language model
        configFileLanguageModel_lexicon = os.path.join('LanguageModel', 'Configurations', 'Configurations-lexicon.xml')
        configFileLanguageModel_Tasi = os.path.join('LanguageModel', 'Configurations', 'Configurations-Tasi.xml')
        stopWordsFileName = os.path.join('LanguageModel', 'Input', 'stop_words.txt')
        linksDBFile = os.path.join('LanguageModel', 'Output', 'links_database.txt')
        # The serialization file to save the model
        languageModelSerializationFile = os.path.join('LanguageModel', 'Output', 'language_model.bin')
        
        # Start the LanguageModel:
        
        # Initialize the LanguageModel_Lexicon
        self.languageModel_lexicon = LanguageModel(configFileLanguageModel_lexicon, stopWordsFileName, languageModelSerializationFile, linksDBFile, datasetBuilder.trainSet)
        self.languageModel_lexicon.BuildLanguageModel()

         # Initialize the LanguageModel_Tasi
        self.languageModel_Tasi = LanguageModel(configFileLanguageModel_Tasi, stopWordsFileName, languageModelSerializationFile, linksDBFile, datasetBuilder.trainSet)
        self.languageModel_Tasi.BuildLanguageModel()
        
        # Configurations file xml of the features extractor
        configFileFeaturesExtractor_Lexicon = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations-lexicon.xml')
        configFileFeaturesExtractor_Tasi = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations-Tasi.xml')
        # The serialization file to save the features
        trainFeaturesSerializationFile = os.path.join('FeaturesExtractor', 'Output', 'train_features.bin')
        trainLabelsSerializationFile = os.path.join('FeaturesExtractor', 'Output', 'train_labels.bin')
        
        # Start the FeaturesExtractor:
        #-----------------------------    
        # Initialize the FeaturesExtractor _ Lexicon
        trainFeaturesExtractor_Lexicon = FeaturesExtractor(configFileFeaturesExtractor_Lexicon, trainFeaturesSerializationFile, trainLabelsSerializationFile, self.languageModel_lexicon, datasetBuilder.trainSet)
        trainFeaturesExtractor_Lexicon.ExtractNumTfFeatures()

        # Initialize the FeaturesExtractor _ Tasi
        trainFeaturesExtractor_Tasi = FeaturesExtractor(configFileFeaturesExtractor_Tasi, trainFeaturesSerializationFile, trainLabelsSerializationFile, self.languageModel_Tasi, datasetBuilder.trainSet)
        trainFeaturesExtractor_Tasi.ExtractNumTfFeatures()

        # The serialization file to save the features
        configFileClassifier_Lexicon = os.path.join('Classifier', 'Configurations', 'Configurations-lexicon.xml')
        configFileClassifier_Tasi = os.path.join('Classifier', 'Configurations', 'Configurations-Tasi.xml')
        modelSerializationFile = os.path.join('Classifier', 'Output', 'classifier_model.bin')
    
        # Start the Classifier:
        #---------------------
        print(trainFeaturesExtractor_Tasi.labels[:4])
        print([i['label'] for i in trainFeaturesExtractor_Lexicon.dataSet[:4]])
        self.classifier_Lexicon = Classifier(configFileClassifier_Lexicon, modelSerializationFile,  trainFeaturesExtractor_Lexicon.features, trainFeaturesExtractor_Lexicon.labels, [], [])
        self.classifier_Tasi = Classifier(configFileClassifier_Tasi, modelSerializationFile, trainFeaturesExtractor_Tasi.features,
                        trainFeaturesExtractor_Tasi.labels, [],[])
        
        # Train
        self.classifier_Lexicon.Train()
        self.classifier_Tasi.Train()
    def init(cls, save_path, use_backend=True, pre_stocks=None):
        '''
        Constructor
        :type self:
        '''
        global stocks
        if pre_stocks:
            stocks = pre_stocks

        for stock in stocks:
            print('Buildind model for %s' % stock)
            stock_model = {}
            # Start the DatasetBuilder
            #-------------------------
            # Configurations file xml of the dataset builder
            configFileDatasetBuilder = os.path.join('DatasetBuilder',
                                                    'Configurations',
                                                    'Configurations.xml')

            # The serialization file to save the dataset
            datasetSerializationFile = os.path.join('DatasetBuilder', 'Output',
                                                    'dataset.bin')

            # The XLSX file name for train set
            xlsxTrainFileName = os.path.join('DatasetBuilder', 'Input',
                                             'train')

            # Initialize the DatasetBuilder from serialization file
            datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [],
                                            datasetSerializationFile)
            if use_backend:
                datasetBuilder.trainSet = datasetBuilder.GetDatasetFromBackend(
                    stock)
            else:
                datasetBuilder.trainSet = datasetBuilder.GetDatasetFromXLSXFile(
                    xlsxTrainFileName, stock)
            if len(datasetBuilder.trainSet) < NMIN_SET:
                print("Not enough data: ", len(datasetBuilder.trainSet))
                continue
            datasetBuilder.trainSet = datasetBuilder.trainSet[NVALID:]
            # Configurations file xml of the language model
            configFileLanguageModel_lexicon = os.path.join(
                'LanguageModel', 'Configurations', 'Configurations-Tasi.xml')
            stopWordsFileName = os.path.join('LanguageModel', 'Input',
                                             'stop_words.txt')
            linksDBFile = os.path.join('LanguageModel', 'Output',
                                       'links_database.txt')
            # The serialization file to save the model
            languageModelSerializationFile = os.path.join(
                'LanguageModel', 'Output', 'language_model.bin')

            # Start the LanguageModel:

            # Initialize the LanguageModel_Lexicon
            stock_model['languageModel_lexicon'] = LanguageModel(
                configFileLanguageModel_lexicon, stopWordsFileName,
                languageModelSerializationFile, linksDBFile,
                datasetBuilder.trainSet)
            stock_model['languageModel_lexicon'].BuildLanguageModel()

            # Configurations file xml of the features extractor
            configFileFeaturesExtractor_Lexicon = os.path.join(
                'FeaturesExtractor', 'Configurations',
                'Configurations-Tasi.xml')
            # The serialization file to save the features
            trainFeaturesSerializationFile = os.path.join(
                'FeaturesExtractor', 'Output', 'train_features.bin')
            trainLabelsSerializationFile = os.path.join(
                'FeaturesExtractor', 'Output', 'train_labels.bin')

            # Start the FeaturesExtractor:
            #-----------------------------
            # Initialize the FeaturesExtractor _ Lexicon
            trainFeaturesExtractor_Lexicon = FeaturesExtractor(
                configFileFeaturesExtractor_Lexicon,
                trainFeaturesSerializationFile, trainLabelsSerializationFile,
                stock_model['languageModel_lexicon'], datasetBuilder.trainSet)
            trainFeaturesExtractor_Lexicon.ExtractNumTfFeatures(sparse=True)
            #print(trainFeaturesExtractor_Lexicon.features[0])
            # The serialization file to save the features
            configFileClassifier_Lexicon = os.path.join(
                'Classifier', 'Configurations', 'Configurations-Tasi.xml')
            modelSerializationFile = os.path.join('Classifier', 'Output',
                                                  'classifier_model.bin')

            # Start the Classifier:
            #---------------------
            stock_model['classifier_Lexicon'] = Classifier(
                configFileClassifier_Lexicon, modelSerializationFile,
                trainFeaturesExtractor_Lexicon.sparse_features,
                trainFeaturesExtractor_Lexicon.labels, [], [])
            #stock_model['classifier_Lexicon'] = Classifier(configFileClassifier_Lexicon, modelSerializationFile,  trainFeaturesExtractor_Lexicon.features, trainFeaturesExtractor_Lexicon.labels, [], [])
            #print(trainFeaturesExtractor_Lexicon.labels[:4])
            #print([i['label'] for i in trainFeaturesExtractor_Lexicon.dataSet[:4]])
            # Train
            stock_model['classifier_Lexicon'].Train()
            stock_model['training_samples'] = len(datasetBuilder.trainSet)
            cls.save(save_path, stock, stock_model)

            print("----------------------------------------------------")