def Classify(self, text, stockName):
    """Classify each snippet in *text* with the model matching *stockName*.

    :param text: iterable of raw text snippets to classify
    :param stockName: 'Tasi' selects the Tasi model; anything else selects
        the lexicon model
    :return: predicted labels, as returned by Classifier.Test()
    """
    testSet = [{'label': '', 'text': t} for t in text]
    if stockName == 'Tasi':
        # Configurations file xml of the features extractor
        configFileFeaturesExtractor = os.path.join(
            'FeaturesExtractor', 'Configurations', 'Configurations-Tasi.xml')
        testFeaturesExtractor = FeaturesExtractor(
            configFileFeaturesExtractor, None, None,
            self.languageModel_Tasi, testSet)
        testFeaturesExtractor.ExtractNumTfFeatures()
        classifier = self.classifier_Tasi
    else:
        configFileFeaturesExtractor = os.path.join(
            'FeaturesExtractor', 'Configurations', 'Configurations-lexicon.xml')
        testFeaturesExtractor = FeaturesExtractor(
            configFileFeaturesExtractor, None, None,
            self.languageModel_lexicon, testSet)
        # BUG FIX: the extraction call was missing here, so .features was
        # never populated (the Tasi branch and the training code in __init__
        # both call ExtractNumTfFeatures on the lexicon extractor).
        testFeaturesExtractor.ExtractNumTfFeatures()
        classifier = self.classifier_Lexicon
    classifier.testFeatures = testFeaturesExtractor.features
    # BUG FIX: `testTargets[i] = 1` on a freshly-emptied list raised
    # IndexError; targets are dummies because true labels are unknown here,
    # Test() merely requires them to be present.
    classifier.testTargets = [1] * len(classifier.testFeatures)
    label, acc, val = classifier.Test()
    return label
def prepare_data(self, dataset):
    """Extract dense sentiment features from *dataset*.

    :param dataset: training samples passed to FeaturesExtractor
    :return: (X, Y) where X is a list of dense feature vectors and Y the
        corresponding label list
    """
    trainFeaturesExtractor = FeaturesExtractor(
        self.configFileFeaturesExtractor,
        self.trainFeaturesSerializationFile,
        self.trainLabelsSerializationFile,
        self.languageModel,
        dataset,
        sentiment_features=True)
    trainFeaturesExtractor.ExtractNumTfFeatures(init_dicts())
    # Feature ids are 1-based; the widest dense vector needs `maxid` slots.
    maxid = max(max(f.keys()) for f in trainFeaturesExtractor.features)
    X = []
    Y = []
    for i, feats in enumerate(trainFeaturesExtractor.features):
        itemx = [0] * maxid
        # FIX: removed dead one-hot vector `l` that was built per item but
        # never used anywhere.
        for j, v in feats.items():
            itemx[j - 1] = v  # shift 1-based feature id to 0-based column
        X.append(itemx)
        Y.append(trainFeaturesExtractor.labels[i])
    # Drop references to large intermediates so they can be collected.
    trainFeaturesExtractor.dataset = []
    trainFeaturesExtractor.features = []
    trainFeaturesExtractor.labels = []
    return X, Y
def RunFeatureExtractor(self, _Classifier, trainSet):
    """Build a FeaturesExtractor for *_Classifier* and run feature extraction
    over *trainSet*, returning the populated extractor."""
    # Per-classifier configuration file of the features extractor.
    config_path = os.path.join(
        self.basePath, "FeaturesExtractor", "Configurations",
        "Configurations-" + _Classifier + ".xml")
    # Serialization targets for the extracted features / labels.
    output_dir = os.path.join(self.basePath, "FeaturesExtractor", "Output")
    features_file = os.path.join(output_dir, "train_features.bin")
    labels_file = os.path.join(output_dir, "train_labels.bin")

    extractor = FeaturesExtractor(config_path, features_file, labels_file,
                                  self.languageModel, trainSet)
    # The Lexicon classifier consumes lexicon features; every other
    # classifier uses numeric term-frequency features.
    if _Classifier == "Lexicon":
        extractor.ExtractLexiconFeatures()
    else:
        extractor.ExtractNumTfFeatures()
    return extractor
def prepare_data(self, dataset):
    """Extract dense question features from *dataset*.

    :param dataset: training samples passed to FeaturesExtractor
    :return: (X, Y, L) — dense feature vectors, labels, and len(dataset)
    """
    trainFeaturesExtractor = FeaturesExtractor(
        self.configFileFeaturesExtractor,
        self.trainFeaturesSerializationFile,
        self.trainLabelsSerializationFile,
        self.languageModel,
        dataset,
        questions_features=True)
    print("Data length: ", len(dataset))
    words_dict = self.datasetBuilder.getQuestionsDatasetDictionary(
        self.words_dict_path)
    trainFeaturesExtractor.ExtractNumTfFeatures(questions_dict=words_dict)
    # Feature ids are 1-based; the widest dense vector needs `maxid` slots.
    maxid = max(max(f.keys()) for f in trainFeaturesExtractor.features)
    X = []
    Y = []
    L = len(dataset)
    for i, feats in enumerate(trainFeaturesExtractor.features):
        itemx = [0] * maxid
        # FIX: removed dead one-hot vector `l` that was built per item but
        # never used anywhere.
        for j, v in feats.items():
            itemx[j - 1] = v  # shift 1-based feature id to 0-based column
        X.append(itemx)
        Y.append(trainFeaturesExtractor.labels[i])
    return X, Y, L
def evaluate(cls, path, use_backend=True, pre_stocks=None):
    """Validate every stored per-stock model on held-out data.

    :param path: directory the per-stock models were saved under
    :param use_backend: fetch the dataset from the backend when True,
        otherwise read it from the train XLSX file
    :param pre_stocks: optional list overriding the module-level `stocks`
    :return: dict mapping stock name -> {'accuracy', 'training_samples'}
    """
    global stocks
    if pre_stocks:
        stocks = pre_stocks
    validation_accuracy = {}
    for stockName in stocks:
        model = cls.load(path, stockName)
        if not model:
            continue
        # Dataset builder configuration and I/O locations.
        builder_config = os.path.join('DatasetBuilder', 'Configurations',
                                      'Configurations.xml')
        dataset_bin = os.path.join('DatasetBuilder', 'Output', 'dataset.bin')
        xlsx_train = os.path.join('DatasetBuilder', 'Input', 'train')
        datasetBuilder = DatasetBuilder(builder_config, [], dataset_bin)
        if use_backend:
            testSet = datasetBuilder.GetDatasetFromBackend(stockName)
        else:
            testSet = datasetBuilder.GetDatasetFromXLSXFile(xlsx_train,
                                                            stockName)
        if len(testSet) < NMIN_SET:
            continue
        # The first NVALID samples form the validation slice.
        testSet = testSet[:NVALID]
        print('Using model for %s' % stockName)
        extractor_config = os.path.join(
            'FeaturesExtractor', 'Configurations', 'Configurations-Tasi.xml')
        extractor = FeaturesExtractor(extractor_config, None, None,
                                      model['languageModel_lexicon'], testSet)
        extractor.ExtractNumTfFeatures(sparse=True)
        clf = model['classifier_Lexicon']
        clf.testFeatures = extractor.sparse_features
        clf.testTargets = extractor.labels
        label, acc, val = clf.Test()
        print(acc, val)
        validation_accuracy[stockName] = {
            'accuracy': acc,
            'training_samples': model['training_samples'],
        }
    return validation_accuracy
def prepare_data(self, dataset):
    """Extract sparse sentiment features and labels from *dataset*.

    :return: (X, Y) — sparse feature matrix and label array
    """
    extractor = FeaturesExtractor(
        self.configFileFeaturesExtractor,
        self.trainFeaturesSerializationFile,
        self.trainLabelsSerializationFile,
        self.languageModel,
        dataset,
        sentiment_features=True)
    extractor.ExtractNumTfFeatures(sentiment_dict=init_dicts(), sparse=True)
    X = extractor.sparse_features
    Y = np.array(extractor.labels)
    # Drop references to large intermediates so they can be collected.
    extractor.dataset = []
    extractor.features = []
    extractor.labels = []
    return X, Y
def classify(cls, text, stockName, path):
    """Predict sentiment labels for *text* using the stored *stockName* model.

    :return: list of labels per snippet — 0 neutral/uncertain, 1 positive,
        2 negative (per the probability thresholds below); or an Exception
        object when the stock's model is missing
    """
    model = cls.load(path, stockName)
    if not model:
        # NOTE(review): returns the Exception object instead of raising it,
        # so callers must type-check the result. Kept as-is to preserve the
        # existing contract.
        return Exception('Stock wasn\'t found')
    testSet = [{'label': '', 'text': t} for t in text]
    print('Using model for %s' % stockName)
    config = os.path.join('FeaturesExtractor', 'Configurations',
                          'Configurations-Tasi.xml')
    extractor = FeaturesExtractor(config, None, None,
                                  model['languageModel_lexicon'], testSet)
    extractor.ExtractNumTfFeatures(sparse=True)
    clf = model['classifier_Lexicon']
    clf.testFeatures = extractor.sparse_features
    # Dummy targets — true labels are unknown at prediction time.
    clf.testTargets = [1 for _ in range(clf.testFeatures.shape[0])]
    probs = clf.predict_propa()
    label = []
    for prob in probs:
        print(prob)
        prob = prob[0]
        if abs(prob[0] - prob[1]) < 0.2:
            label.append(0)   # classes too close to call -> neutral
        elif max(prob) < 0.4:
            label.append(0)   # overall confidence too low -> neutral
        elif prob[0] > prob[1]:
            label.append(1)
        else:
            label.append(2)
    return label
def Classify(self, text):
    """Classify each snippet in *text* with the configured classifier.

    :return: predicted labels, as returned by Classifier.Test()
    """
    testSet = [{'label': '', 'text': t} for t in text]
    # Per-classifier configuration file of the features extractor.
    config_path = os.path.join(
        self.basePath, "FeaturesExtractor", "Configurations",
        "Configurations-" + self.usedClassifier + ".xml")
    extractor = FeaturesExtractor(config_path, None, None,
                                  self.languageModel, testSet)
    if self.usedClassifier == "Lexicon":
        extractor.ExtractLexiconFeatures()
    else:
        extractor.ExtractNumTfFeatures()
    self.classifier.testFeatures = extractor.features
    # Dummy targets: Test() requires targets, but true labels are unknown.
    self.classifier.testTargets = [1 for _ in range(len(self.classifier.testFeatures))]
    label, acc, val = self.classifier.Test()
    return label
import os

# Output paths for serialized features/labels and plain-text dumps.
# PORTABILITY FIX: the original used hard-coded Windows backslash paths
# (".\\FeaturesExtractor\\Output\\...") while the rest of the codebase uses
# os.path.join; normalized so the script also runs on POSIX systems.
trainLabelsSerializationFile = os.path.join("FeaturesExtractor", "Output", "train_labels.bin")
testFeaturesSerializationFile = os.path.join("FeaturesExtractor", "Output", "test_features.bin")
testLabelsSerializationFile = os.path.join("FeaturesExtractor", "Output", "test_labels.bin")
testExportFileName = os.path.join("FeaturesExtractor", "Output", "test_data.txt")
trainExportFileName = os.path.join("FeaturesExtractor", "Output", "train_data.txt")

# Start the FeaturesExtractor:
#-----------------------------
# Initialize the FeaturesExtractor
trainFeaturesExtractor = FeaturesExtractor(configFileFeaturesExtractor,
                                           trainFeaturesSerializationFile,
                                           trainLabelsSerializationFile,
                                           languageModel,
                                           datasetBuilder.trainSet)
trainFeaturesExtractor.ExtractNumTfFeatures()
# Alternative extraction / export steps, kept for reference:
#trainFeaturesExtractor.ExtractTFFeatures()
#trainFeaturesExtractor.ExtractTFIDFFeatures()
#trainFeaturesExtractor.ExtractKLFeatures()
#trainFeaturesExtractor.SaveFeatures()
#trainFeaturesExtractor.SaveLabels()
#trainFeaturesExtractor.DumpFeaturesToTxt(trainExportFileName)
#testFeaturesExtractor = FeaturesExtractor(configFileFeaturesExtractor, testFeaturesSerializationFile, testLabelsSerializationFile, languageModel, datasetBuilder.testSet)
#testFeaturesExtractor.ExtractTFFeatures()
#testFeaturesExtractor.ExtractTFIDFFeatures()
#testFeaturesExtractor.ExtractNumTfFeatures()
#testFeaturesExtractor.ExtractKLFeatures()
#testFeaturesExtractor.SaveFeatures()
#testFeaturesExtractor.SaveLabels()
#testFeaturesExtractor.DumpFeaturesToTxt(testExportFileName)
def __init__(self):
    """Build the training set, both language models, both feature extractors,
    and train the Lexicon and Tasi classifiers."""
    # --- Dataset -----------------------------------------------------
    # Configurations file xml of the dataset builder.
    builderConfig = os.path.join('DatasetBuilder', 'Configurations', 'Configurations.xml')
    # The serialization file to save the dataset.
    datasetBin = os.path.join('DatasetBuilder', 'Output', 'dataset.bin')
    # The XLSX file name for the train set.
    xlsxTrain = os.path.join('DatasetBuilder', 'Input', 'train')
    datasetBuilder = DatasetBuilder(builderConfig, [], datasetBin)
    datasetBuilder.trainSet = datasetBuilder.GetDatasetFromXLSXFile(xlsxTrain)

    # --- Language models ----------------------------------------------
    lmConfigLexicon = os.path.join('LanguageModel', 'Configurations', 'Configurations-lexicon.xml')
    lmConfigTasi = os.path.join('LanguageModel', 'Configurations', 'Configurations-Tasi.xml')
    stopWords = os.path.join('LanguageModel', 'Input', 'stop_words.txt')
    linksDB = os.path.join('LanguageModel', 'Output', 'links_database.txt')
    # The serialization file to save the model (shared by both models).
    lmBin = os.path.join('LanguageModel', 'Output', 'language_model.bin')

    self.languageModel_lexicon = LanguageModel(lmConfigLexicon, stopWords, lmBin, linksDB, datasetBuilder.trainSet)
    self.languageModel_lexicon.BuildLanguageModel()
    self.languageModel_Tasi = LanguageModel(lmConfigTasi, stopWords, lmBin, linksDB, datasetBuilder.trainSet)
    self.languageModel_Tasi.BuildLanguageModel()

    # --- Feature extraction --------------------------------------------
    feConfigLexicon = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations-lexicon.xml')
    feConfigTasi = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations-Tasi.xml')
    featuresBin = os.path.join('FeaturesExtractor', 'Output', 'train_features.bin')
    labelsBin = os.path.join('FeaturesExtractor', 'Output', 'train_labels.bin')

    extractorLexicon = FeaturesExtractor(feConfigLexicon, featuresBin, labelsBin, self.languageModel_lexicon, datasetBuilder.trainSet)
    extractorLexicon.ExtractNumTfFeatures()
    extractorTasi = FeaturesExtractor(feConfigTasi, featuresBin, labelsBin, self.languageModel_Tasi, datasetBuilder.trainSet)
    extractorTasi.ExtractNumTfFeatures()

    # --- Classifiers -----------------------------------------------------
    clfConfigLexicon = os.path.join('Classifier', 'Configurations', 'Configurations-lexicon.xml')
    clfConfigTasi = os.path.join('Classifier', 'Configurations', 'Configurations-Tasi.xml')
    # Model serialization file (shared by both classifiers).
    modelBin = os.path.join('Classifier', 'Output', 'classifier_model.bin')

    # Debug output: first few labels vs. their source dataset entries.
    print(extractorTasi.labels[:4])
    print([i['label'] for i in extractorLexicon.dataSet[:4]])

    self.classifier_Lexicon = Classifier(clfConfigLexicon, modelBin, extractorLexicon.features, extractorLexicon.labels, [], [])
    self.classifier_Tasi = Classifier(clfConfigTasi, modelBin, extractorTasi.features, extractorTasi.labels, [], [])

    # Train both models.
    self.classifier_Lexicon.Train()
    self.classifier_Tasi.Train()
def init(cls, save_path, use_backend=True, pre_stocks=None):
    """Build, train and persist a per-stock sentiment model for every stock.

    :param save_path: directory to persist each trained stock model under
    :param use_backend: fetch training data from the backend when True,
        otherwise read it from the train XLSX file
    :param pre_stocks: optional list overriding the module-level `stocks`
    """
    global stocks
    if pre_stocks:
        stocks = pre_stocks
    for stock in stocks:
        print('Building model for %s' % stock)  # FIX: typo 'Buildind'
        stock_model = {}

        # --- Dataset -----------------------------------------------------
        configFileDatasetBuilder = os.path.join('DatasetBuilder',
                                                'Configurations',
                                                'Configurations.xml')
        # The serialization file to save the dataset.
        datasetSerializationFile = os.path.join('DatasetBuilder', 'Output',
                                                'dataset.bin')
        # The XLSX file name for the train set.
        xlsxTrainFileName = os.path.join('DatasetBuilder', 'Input', 'train')
        datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [],
                                        datasetSerializationFile)
        if use_backend:
            datasetBuilder.trainSet = datasetBuilder.GetDatasetFromBackend(
                stock)
        else:
            datasetBuilder.trainSet = datasetBuilder.GetDatasetFromXLSXFile(
                xlsxTrainFileName, stock)
        if len(datasetBuilder.trainSet) < NMIN_SET:
            print("Not enough data: ", len(datasetBuilder.trainSet))
            continue
        # The first NVALID samples are reserved for validation (evaluate()).
        datasetBuilder.trainSet = datasetBuilder.trainSet[NVALID:]

        # --- Language model ----------------------------------------------
        # NOTE(review): despite the '_lexicon' name, this loads the Tasi
        # configuration — kept as-is; confirm intent before renaming.
        configFileLanguageModel_lexicon = os.path.join(
            'LanguageModel', 'Configurations', 'Configurations-Tasi.xml')
        stopWordsFileName = os.path.join('LanguageModel', 'Input',
                                         'stop_words.txt')
        linksDBFile = os.path.join('LanguageModel', 'Output',
                                   'links_database.txt')
        # The serialization file to save the model.
        languageModelSerializationFile = os.path.join(
            'LanguageModel', 'Output', 'language_model.bin')
        stock_model['languageModel_lexicon'] = LanguageModel(
            configFileLanguageModel_lexicon, stopWordsFileName,
            languageModelSerializationFile, linksDBFile,
            datasetBuilder.trainSet)
        stock_model['languageModel_lexicon'].BuildLanguageModel()

        # --- Feature extraction --------------------------------------------
        configFileFeaturesExtractor_Lexicon = os.path.join(
            'FeaturesExtractor', 'Configurations', 'Configurations-Tasi.xml')
        trainFeaturesSerializationFile = os.path.join(
            'FeaturesExtractor', 'Output', 'train_features.bin')
        trainLabelsSerializationFile = os.path.join(
            'FeaturesExtractor', 'Output', 'train_labels.bin')
        trainFeaturesExtractor_Lexicon = FeaturesExtractor(
            configFileFeaturesExtractor_Lexicon,
            trainFeaturesSerializationFile, trainLabelsSerializationFile,
            stock_model['languageModel_lexicon'], datasetBuilder.trainSet)
        trainFeaturesExtractor_Lexicon.ExtractNumTfFeatures(sparse=True)

        # --- Classifier ------------------------------------------------------
        configFileClassifier_Lexicon = os.path.join(
            'Classifier', 'Configurations', 'Configurations-Tasi.xml')
        modelSerializationFile = os.path.join('Classifier', 'Output',
                                              'classifier_model.bin')
        stock_model['classifier_Lexicon'] = Classifier(
            configFileClassifier_Lexicon, modelSerializationFile,
            trainFeaturesExtractor_Lexicon.sparse_features,
            trainFeaturesExtractor_Lexicon.labels, [], [])
        stock_model['classifier_Lexicon'].Train()
        stock_model['training_samples'] = len(datasetBuilder.trainSet)
        cls.save(save_path, stock, stock_model)
        print("----------------------------------------------------")