def evaluate(cls, path, use_backend=True, pre_stocks=None):
    validation_accuracy = {}
    global stocks
    if pre_stocks:
        stocks = pre_stocks
    for stockName in stocks:
        model = cls.load(path, stockName)
        if not model:
            continue
        configFileDatasetBuilder = os.path.join('DatasetBuilder', 'Configurations', 'Configurations.xml')
        # The serialization file to save the dataset
        datasetSerializationFile = os.path.join('DatasetBuilder', 'Output', 'dataset.bin')
        # The XLSX file name for train set
        xlsxTrainFileName = os.path.join('DatasetBuilder', 'Input', 'train')
        # Initialize the DatasetBuilder from serialization file
        datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)
        if use_backend:
            testSet = datasetBuilder.GetDatasetFromBackend(stockName)
        else:
            testSet = datasetBuilder.GetDatasetFromXLSXFile(xlsxTrainFileName, stockName)
        if len(testSet) < NMIN_SET:
            continue
        testSet = testSet[:NVALID]
        print('Using model for %s' % stockName)
        configFileFeaturesExtractor = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations-Tasi.xml')
        testFeaturesExtractor = FeaturesExtractor(configFileFeaturesExtractor, None, None,
                                                  model['languageModel_lexicon'], testSet)
        #testFeaturesExtractor.ExtractLexiconFeatures()
        testFeaturesExtractor.ExtractNumTfFeatures(sparse=True)
        model['classifier_Lexicon'].testFeatures = testFeaturesExtractor.sparse_features
        model['classifier_Lexicon'].testTargets = testFeaturesExtractor.labels
        label, acc, val = model['classifier_Lexicon'].Test()
        print(acc, val)
        validation_accuracy[stockName] = {
            'accuracy': acc,
            'training_samples': model['training_samples']
        }
    return validation_accuracy
def init_dicts():
    configFileDatasetBuilder = os.path.join('DatasetBuilder', 'Configurations', 'Configurations.xml')
    datasetSerializationFile = os.path.join('DatasetBuilder', 'Output', 'dataset.bin')
    xlsxTrainFileName = os.path.join('DatasetBuilder', 'Input', 'sentiment')
    datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)
    datasetBuilder.trainSet = datasetBuilder.GetSentimentDatasetFromXLSXFile(xlsxTrainFileName).values()
    words_dict = {'negative': [], 'positive': [], 'neutral': []}
    for item in datasetBuilder.trainSet:
        words_dict[item['label']] += item['words']
    for k in words_dict:
        words_dict[k] = list(set(words_dict[k]))
    return words_dict
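# Hedged usage sketch (not from the source): based on the function above, init_dicts()
# is assumed to return a mapping from sentiment label to the de-duplicated vocabulary
# seen under that label in the XLSX training set.
if __name__ == '__main__':
    words_dict = init_dicts()
    for label, words in words_dict.items():
        # e.g. "positive: 1523 distinct words"
        print('%s: %d distinct words' % (label, len(words)))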
def __init__(self, basePath, stockName, Retrain):
    '''
    Constructor
    :type self:
    '''
    if basePath is None:
        # fall back to the existing class-level default path
        self.basePath = self.basePath
    else:
        self.basePath = basePath
    self.stockName = stockName
    serializationFile = open(os.path.join(self.basePath, 'StockToClassifier.bin'), 'rb')
    self.StockToClassifier = pickle.load(serializationFile)
    #import pdb; pdb.set_trace()
    self.usedClassifier = self.StockToClassifier[self.stockName]

    # Start the DatasetBuilder
    #-------------------------
    # Configurations file xml of the dataset builder
    configFileDatasetBuilder = os.path.join(self.basePath, "DatasetBuilder", "Configurations", "Configurations.xml")
    # The serialization file to save the dataset
    datasetSerializationFile = os.path.join(self.basePath, "DatasetBuilder", "Output", "dataset.bin")

    if Retrain == False:
        # The XLSX file name for train set
        xlsxTrainFileName = os.path.join(self.basePath, "DatasetBuilder", "Input", "train")
        # Initialize the DatasetBuilder from serialization file
        datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)
        datasetBuilder.trainSet = datasetBuilder.GetDatasetFromXLSXFile(xlsxTrainFileName)
        self.RunLanguageModel(self.usedClassifier, datasetBuilder.trainSet)
        trainFeaturesExtractor = self.RunFeatureExtractor(self.usedClassifier, datasetBuilder.trainSet)
        self.Train(self.usedClassifier, trainFeaturesExtractor, True)
    else:
        # Initialize the DatasetBuilder from serialization file
        datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)
configFileDatasetBuilder = ".\\DatasetBuilder\\Configurations\\Configurations.xml"
# The CSV file name for tweets to be manually labeled
csvManualLabelsFileName = ".\\DatasetBuilder\\Output\\ManualLabels"
xlsxManualLabelsFileName = ".\\DatasetBuilder\\Output\\ManualLabels"
# The serialization file to save the dataset
datasetSerializationFile = ".\\DatasetBuilder\\Output\\dataset.bin"
# Train/Test serialization file
trainTestSerializationFile = ".\\DatasetBuilder\\Output\\train_test_dataset.bin"

# Check if the current stage is to initialize random labels
if LOAD_DATASET_FROM_SERIALIZATION_FILE:
    # Initialize the DatasetBuilder from serialization file
    datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)
    # Load the dataset
    datasetBuilder.LoadDataset()
    # Form or load the train/test sets
    if SPLIT_DATASET_TRAIN_TEST:
        datasetBuilder.SplitTrainTest()
        datasetBuilder.SaveTrainTestDataset(trainTestSerializationFile)
    elif LOAD_TRAIN_TEST:
        datasetBuilder.LoadTrainTestDataset(trainTestSerializationFile)
elif UPDATE_LABELS_FROM_CSV:
    # Initialize the DatasetBuilder from serialization file
    datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)
# The XLSX file name for tweets to be manually labeled
xlsxManualLabelsFileName = ".\\DatasetBuilder\\Output\\Completed\\ManualLabels"
# The serialization file to save the dataset
datasetSerializationFile = ".\\DatasetBuilder\\Output\\dataset.bin"
# Train/Test serialization file
trainTestSerializationFile = ".\\DatasetBuilder\\Output\\train_test_dataset.bin"
# The XLSX file name for train set
xlsxTrainFileName = ".\\DatasetBuilder\\Input\\train"
xlsxTestFileName = ".\\DatasetBuilder\\Input\\test"

# Initialize the DatasetBuilder from serialization file
datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)

# Load the dataset
#datasetBuilder.LoadDataset()

# Update the labels
'''
numFiles = 50
for i in range(numFiles):
    print('Updating labels from file ' + xlsxManualLabelsFileName + "_" + str(i + 1) + '...')
    datasetBuilder.UpdateManualLabelsFromXLSXFile(xlsxManualLabelsFileName + "_" + str(i + 1), (i + 1))

# This should be done separately when dataset is manually labeled
# Form or load the train/test sets
datasetBuilder.SplitTrainTest()
'''

datasetBuilder.trainSet = datasetBuilder.GetDatasetFromXLSXFile(xlsxTrainFileName)
f_in = open('.\\TwitterCrawler\\stocks.txt', 'r', encoding='utf-8')
lines = f_in.readlines()
queryArray = []
stock_under_test = 4
i = 1
for line in lines:
    if i == stock_under_test:
        queryArray.append(line.strip())
        print(line.strip() + "\n")
        break
    i += 1

# Check if the current stage is to initialize random labels
if LOAD_DATASET_FROM_SERIALIZATION_FILE:
    # Initialize the DatasetBuilder from serialization file
    datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)
    # Load the dataset
    datasetBuilder.LoadDataset()
    # Form or load the train/test sets
    if SPLIT_DATASET_TRAIN_TEST:
        datasetBuilder.SplitTrainTest()
        datasetBuilder.SaveTrainTestDataset(trainTestSerializationFile)
    elif LOAD_TRAIN_TEST:
        datasetBuilder.LoadTrainTestDataset(trainTestSerializationFile)
elif UPDATE_LABELS_FROM_CSV:
    # Initialize the DatasetBuilder from serialization file
    datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)
dirName = '.\\crawler\\news'
for file in os.listdir(dirName):
    if file.endswith(".csv"):
        full_file_name = dirName + '\\' + file
        d.csvNewsFileName = full_file_name
        news_headlines.extend(d.get_news_headlines())

def CollectPrices():
    dirName = '.\\crawler\\prices'
    for file in os.listdir('.\\crawler\\prices'):
        if file.endswith(".csv"):
            full_file_name = dirName + '\\' + file
            print(full_file_name)
            d.csvPricesFileName = full_file_name
            prices.extend(d.get_prices())

d = DatasetBuilder()
CollectNews()
CollectPrices()

d.csvNewsFileName = 'news_all.csv'
d.DumpNewsCSV(news_headlines)
d.csvPricesFileName = 'prices_all.csv'
d.DumpPricesCSV(prices)
def __init__(self):
    '''
    Constructor
    :type self:
    '''
    # Start the DatasetBuilder
    #-------------------------
    # Configurations file xml of the dataset builder
    configFileDatasetBuilder = os.path.join('DatasetBuilder', 'Configurations', 'Configurations.xml')
    # The serialization file to save the dataset
    datasetSerializationFile = os.path.join('DatasetBuilder', 'Output', 'dataset.bin')
    # The XLSX file name for train set
    xlsxTrainFileName = os.path.join('DatasetBuilder', 'Input', 'train')
    # Initialize the DatasetBuilder from serialization file
    datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)
    datasetBuilder.trainSet = datasetBuilder.GetDatasetFromXLSXFile(xlsxTrainFileName)

    # Configurations file xml of the language model
    configFileLanguageModel_lexicon = os.path.join('LanguageModel', 'Configurations', 'Configurations-lexicon.xml')
    configFileLanguageModel_Tasi = os.path.join('LanguageModel', 'Configurations', 'Configurations-Tasi.xml')
    stopWordsFileName = os.path.join('LanguageModel', 'Input', 'stop_words.txt')
    linksDBFile = os.path.join('LanguageModel', 'Output', 'links_database.txt')
    # The serialization file to save the model
    languageModelSerializationFile = os.path.join('LanguageModel', 'Output', 'language_model.bin')

    # Start the LanguageModel:
    # Initialize the LanguageModel_Lexicon
    self.languageModel_lexicon = LanguageModel(configFileLanguageModel_lexicon, stopWordsFileName,
                                               languageModelSerializationFile, linksDBFile, datasetBuilder.trainSet)
    self.languageModel_lexicon.BuildLanguageModel()
    # Initialize the LanguageModel_Tasi
    self.languageModel_Tasi = LanguageModel(configFileLanguageModel_Tasi, stopWordsFileName,
                                            languageModelSerializationFile, linksDBFile, datasetBuilder.trainSet)
    self.languageModel_Tasi.BuildLanguageModel()

    # Configurations file xml of the features extractor
    configFileFeaturesExtractor_Lexicon = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations-lexicon.xml')
    configFileFeaturesExtractor_Tasi = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations-Tasi.xml')
    # The serialization file to save the features
    trainFeaturesSerializationFile = os.path.join('FeaturesExtractor', 'Output', 'train_features.bin')
    trainLabelsSerializationFile = os.path.join('FeaturesExtractor', 'Output', 'train_labels.bin')

    # Start the FeaturesExtractor:
    #-----------------------------
    # Initialize the FeaturesExtractor _ Lexicon
    trainFeaturesExtractor_Lexicon = FeaturesExtractor(configFileFeaturesExtractor_Lexicon,
                                                       trainFeaturesSerializationFile, trainLabelsSerializationFile,
                                                       self.languageModel_lexicon, datasetBuilder.trainSet)
    trainFeaturesExtractor_Lexicon.ExtractNumTfFeatures()
    # Initialize the FeaturesExtractor _ Tasi
    trainFeaturesExtractor_Tasi = FeaturesExtractor(configFileFeaturesExtractor_Tasi,
                                                    trainFeaturesSerializationFile, trainLabelsSerializationFile,
                                                    self.languageModel_Tasi, datasetBuilder.trainSet)
    trainFeaturesExtractor_Tasi.ExtractNumTfFeatures()

    # The serialization file to save the features
    configFileClassifier_Lexicon = os.path.join('Classifier', 'Configurations', 'Configurations-lexicon.xml')
    configFileClassifier_Tasi = os.path.join('Classifier', 'Configurations', 'Configurations-Tasi.xml')
    modelSerializationFile = os.path.join('Classifier', 'Output', 'classifier_model.bin')

    # Start the Classifier:
    #---------------------
    print(trainFeaturesExtractor_Tasi.labels[:4])
    print([i['label'] for i in trainFeaturesExtractor_Lexicon.dataSet[:4]])
    self.classifier_Lexicon = Classifier(configFileClassifier_Lexicon, modelSerializationFile,
                                         trainFeaturesExtractor_Lexicon.features,
                                         trainFeaturesExtractor_Lexicon.labels, [], [])
    self.classifier_Tasi = Classifier(configFileClassifier_Tasi, modelSerializationFile,
                                      trainFeaturesExtractor_Tasi.features,
                                      trainFeaturesExtractor_Tasi.labels, [], [])

    # Train
    self.classifier_Lexicon.Train()
    self.classifier_Tasi.Train()
'''
Created on Mar 23, 2015

@author: aelsalla
'''
from DatasetBuilder.DatasetBuilder import DatasetBuilder
from FeaturesExtractor.FeaturesExtractor import FeaturesExtractor
from Classifier.Classifier import Classifier
import matplotlib.pyplot as plt
import numpy as np

# Initialize the DatasetBuilder
###############################
dataSetBuilder = DatasetBuilder()
testSetShare = 0.1
dataSetBuilder.csvPricesFileName = '.\\crawler\\prices\\prices_16_4_2015_15_30_55.csv'
dataSetBuilder.csvNewsFileName = '.\\news_all.csv'

trainSet, testSet = dataSetBuilder.BuildDataSet(testSetShare)
dataSet = []
dataSet.extend(trainSet)
dataSet.extend(testSet)

'''
fullPrices = []
for price in dataSetBuilder.get_prices():
    fullPrices.append(float(price['value']))
'''
fullPrices = []
labels = []
sizes = []
def init(cls, save_path, use_backend=True, pre_stocks=None):
    '''
    Constructor
    :type self:
    '''
    global stocks
    if pre_stocks:
        stocks = pre_stocks
    for stock in stocks:
        print('Building model for %s' % stock)
        stock_model = {}

        # Start the DatasetBuilder
        #-------------------------
        # Configurations file xml of the dataset builder
        configFileDatasetBuilder = os.path.join('DatasetBuilder', 'Configurations', 'Configurations.xml')
        # The serialization file to save the dataset
        datasetSerializationFile = os.path.join('DatasetBuilder', 'Output', 'dataset.bin')
        # The XLSX file name for train set
        xlsxTrainFileName = os.path.join('DatasetBuilder', 'Input', 'train')
        # Initialize the DatasetBuilder from serialization file
        datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)
        if use_backend:
            datasetBuilder.trainSet = datasetBuilder.GetDatasetFromBackend(stock)
        else:
            datasetBuilder.trainSet = datasetBuilder.GetDatasetFromXLSXFile(xlsxTrainFileName, stock)
        if len(datasetBuilder.trainSet) < NMIN_SET:
            print("Not enough data: ", len(datasetBuilder.trainSet))
            continue
        datasetBuilder.trainSet = datasetBuilder.trainSet[NVALID:]

        # Configurations file xml of the language model
        configFileLanguageModel_lexicon = os.path.join('LanguageModel', 'Configurations', 'Configurations-Tasi.xml')
        stopWordsFileName = os.path.join('LanguageModel', 'Input', 'stop_words.txt')
        linksDBFile = os.path.join('LanguageModel', 'Output', 'links_database.txt')
        # The serialization file to save the model
        languageModelSerializationFile = os.path.join('LanguageModel', 'Output', 'language_model.bin')

        # Start the LanguageModel:
        # Initialize the LanguageModel_Lexicon
        stock_model['languageModel_lexicon'] = LanguageModel(configFileLanguageModel_lexicon, stopWordsFileName,
                                                             languageModelSerializationFile, linksDBFile,
                                                             datasetBuilder.trainSet)
        stock_model['languageModel_lexicon'].BuildLanguageModel()

        # Configurations file xml of the features extractor
        configFileFeaturesExtractor_Lexicon = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations-Tasi.xml')
        # The serialization file to save the features
        trainFeaturesSerializationFile = os.path.join('FeaturesExtractor', 'Output', 'train_features.bin')
        trainLabelsSerializationFile = os.path.join('FeaturesExtractor', 'Output', 'train_labels.bin')

        # Start the FeaturesExtractor:
        #-----------------------------
        # Initialize the FeaturesExtractor _ Lexicon
        trainFeaturesExtractor_Lexicon = FeaturesExtractor(configFileFeaturesExtractor_Lexicon,
                                                           trainFeaturesSerializationFile,
                                                           trainLabelsSerializationFile,
                                                           stock_model['languageModel_lexicon'],
                                                           datasetBuilder.trainSet)
        trainFeaturesExtractor_Lexicon.ExtractNumTfFeatures(sparse=True)
        #print(trainFeaturesExtractor_Lexicon.features[0])

        # The serialization file to save the features
        configFileClassifier_Lexicon = os.path.join('Classifier', 'Configurations', 'Configurations-Tasi.xml')
        modelSerializationFile = os.path.join('Classifier', 'Output', 'classifier_model.bin')

        # Start the Classifier:
        #---------------------
        stock_model['classifier_Lexicon'] = Classifier(configFileClassifier_Lexicon, modelSerializationFile,
                                                       trainFeaturesExtractor_Lexicon.sparse_features,
                                                       trainFeaturesExtractor_Lexicon.labels, [], [])
        #stock_model['classifier_Lexicon'] = Classifier(configFileClassifier_Lexicon, modelSerializationFile, trainFeaturesExtractor_Lexicon.features, trainFeaturesExtractor_Lexicon.labels, [], [])
        #print(trainFeaturesExtractor_Lexicon.labels[:4])
        #print([i['label'] for i in trainFeaturesExtractor_Lexicon.dataSet[:4]])

        # Train
        stock_model['classifier_Lexicon'].Train()
        stock_model['training_samples'] = len(datasetBuilder.trainSet)
        cls.save(save_path, stock, stock_model)
        print("----------------------------------------------------")
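# Hedged usage sketch (not part of the source; names are assumptions): init() and
# evaluate() above read like classmethods on a per-stock model registry, called
# "StocksModel" here purely for illustration. init() trains and saves one
# classifier per stock; evaluate() reloads each model and reports held-out accuracy.
if __name__ == '__main__':
    save_path = 'models'  # assumed output directory
    StocksModel.init(save_path, use_backend=True)            # hypothetical class name
    report = StocksModel.evaluate(save_path, use_backend=True)
    for stock, stats in report.items():
        print(stock, stats['accuracy'], stats['training_samples'])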
from DatasetBuilder.DatasetBuilder import DatasetBuilder
import datetime, time
import os

os.environ["DJANGO_SETTINGS_MODULE"] = "website.settings"
from app.models import NewsHeadline, Price

# Open the file
priceStartDate = '2015-03-01'

while True:
    logFile = open('crawl_log_file.txt', 'a')
    print('Crawling now ' + str(datetime.datetime.now()))
    logFile.write('Crawling now ' + str(datetime.datetime.now()) + '\n')
    logFile.close()

    d = DatasetBuilder()
    news_headlines = d.ParseNewsURL()
    for headline in news_headlines:
        print(headline['text'] + '\n' + headline['time_stamp'])
        headline_exist = NewsHeadline.objects.filter(text=headline['text'])
        if len(headline_exist) == 0:
            headline_entry = NewsHeadline()
            headline_entry.text = headline['text']
            headline_entry.time_stamp = headline['time_stamp']
            headline_entry.save()

    # Build the timestamped CSV file name from a single snapshot of the current time
    now = datetime.datetime.now()
    d.csvNewsFileName = ('.\\crawler\\news\\news_' + str(now.day) + '_' + str(now.month) + '_' +
                         str(now.year) + '_' + str(now.hour) + '_' + str(now.minute) + '_' +
                         str(now.second) + '.csv')
class SentimentModel(object):
    def __init__(self, modeln=1):
        self.modeln = modeln
        configFileLanguageModel = os.path.join('LanguageModel', 'Configurations', 'Configurations_sentiment.xml')
        stopWordsFileName = os.path.join('LanguageModel', 'Input', 'stop_words.txt')
        linksDBFile = os.path.join('LanguageModel', 'Output', 'links_database.txt')
        languageModelSerializationFile = os.path.join('LanguageModel', 'Output', 'language_model.bin')
        self.languageModel = LanguageModel(configFileLanguageModel, stopWordsFileName,
                                           languageModelSerializationFile, linksDBFile, [])
        self.configFileFeaturesExtractor = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations_sentiment.xml')
        self.trainFeaturesSerializationFile = os.path.join('FeaturesExtractor', 'Output', 'train_features.bin')
        self.trainLabelsSerializationFile = os.path.join('FeaturesExtractor', 'Output', 'train_labels.bin')

    def get_data(self, backend=True):
        configFileDatasetBuilder = os.path.join('DatasetBuilder', 'Configurations', 'Configurations.xml')
        datasetSerializationFile = os.path.join('DatasetBuilder', 'Output', 'dataset.bin')
        self.datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)
        dataset = None
        #if not backend:
        xlsxTrainFileName = os.path.join('DatasetBuilder', 'Input', 'sentiment')
        dataset = self.datasetBuilder.GetSentimentDatasetFromXLSXFile(xlsxTrainFileName)
        if backend:
            dataset2 = self.datasetBuilder.GetSentimentDatasetFromBackend()
            for item in dataset2:
                dataset[item] = dataset2[item]
        dataset = list(dataset.values())
        if len(dataset) < MIN_DATA:
            return
        print("Data length: ", len(dataset))
        self.languageModel.dataset = dataset
        self.languageModel.totalNumberOfDocs = len(dataset)
        self.languageModel.BuildLanguageModel()
        self.languageModel.dataset = []
        return dataset

    def prepare_data(self, dataset):
        trainFeaturesExtractor = FeaturesExtractor(self.configFileFeaturesExtractor,
                                                   self.trainFeaturesSerializationFile,
                                                   self.trainLabelsSerializationFile,
                                                   self.languageModel, dataset,
                                                   sentiment_features=True)
        trainFeaturesExtractor.ExtractNumTfFeatures(init_dicts())
        maxid = max([max(i.keys()) for i in trainFeaturesExtractor.features])
        X = []
        Y = []
        for i, item in enumerate(trainFeaturesExtractor.features):
            itemx = [0 for _ in range(maxid)]
            l = [0, 0, 0]
            l[trainFeaturesExtractor.labels[i] - 1] = 1
            for j in trainFeaturesExtractor.features[i]:
                v = trainFeaturesExtractor.features[i][j]
                itemx[j - 1] = v
            X.append(itemx)
            Y.append(trainFeaturesExtractor.labels[i])
        trainFeaturesExtractor.dataset = []
        trainFeaturesExtractor.features = []
        trainFeaturesExtractor.labels = []
        return X, Y

    def transform_data(self, X, Y):
        X = np.array(X)
        Y = np.array(Y)
        ri = range(X.shape[0])
        rl = range(X.shape[1])
        d = pd.DataFrame(X, index=ri, columns=rl)
        d['class'] = Y
        return d

    def split_data(self, d):
        training_indices, testing_indices = train_test_split(d.index, stratify=d['class'].values,
                                                              train_size=0.75, test_size=0.25)
        return training_indices, testing_indices

    def train(self, backend=True):
        rawdata = self.get_data(backend)
        dxy = self.prepare_data(rawdata)
        d = self.transform_data(dxy[0], dxy[1])
        self.training_indices, self.testing_indices = self.split_data(d)
        X = d.loc[self.training_indices].drop('class', axis=1).values
        Y = d.loc[self.training_indices, 'class'].values
        Xtest = d.loc[self.testing_indices].drop('class', axis=1).values
        Ytest = d.loc[self.testing_indices, 'class'].values
        if self.modeln == 1:
            print(self.fit_model1(X, Y))
            print(self.evaluate_model1(Xtest, Ytest))
        if self.modeln == 2:
            print(self.fit_model2(X, Y))
            print(self.evaluate_model2(Xtest, Ytest))
        if self.modeln == 3:
            print(self.fit_model3(X, Y))
            print(self.evaluate_model3(Xtest, Ytest))
        if self.modeln == 4:
            print(self.fit_model4(X, Y))
            print(self.evaluate_model4(Xtest, Ytest))

    def fit_model1(self, X, Y):
        self.model1 = LinearSVC(C=0.01, penalty="l1", dual=False, random_state=42)
        self.model1.fit(X, Y)
        recall = self.model1.score(X, Y)
        return recall

    def evaluate_model1(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def fit_model2(self, X, Y):
        self.model1 = LinearSVC(C=0.18, penalty="l1", dual=False, random_state=42)
        self.model1.fit(X, Y)
        recall = self.model1.score(X, Y)
        return recall

    def evaluate_model2(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def fit_model3(self, X, Y):
        pre_recall = 0.0
        for g in [0.01, 0.05, 0.1, 0.3, 0.5]:
            model = SVC(C=0.18, gamma=g, random_state=42)
            model.fit(X, Y)
            recall = model.score(X, Y)
            print(recall)
            if recall > pre_recall:
                pre_recall = recall
                self.model1 = model
        return recall

    def evaluate_model3(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def fit_model4(self, X, Y):
        model = SVC(C=0.18, gamma=0.1, random_state=42)
        model.fit(X, Y)
        recall = model.score(X, Y)
        self.model1 = model
        return recall

    def evaluate_model4(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def classify(self, tweets):
        dataset = []
        for tw in tweets:
            dataset.append({'text': tw, 'label': 'neutral'})
        X, Y = self.prepare_data(dataset)
        return self.model1.predict(X)

    @classmethod
    def load(cls, path):
        return pickle.load(open(path, 'rb'))

    def save(self, path):
        return pickle.dump(self, open(path, 'wb'))
class SentimentModel(object):
    def __init__(self, modeln=1):
        self.modeln = modeln
        configFileLanguageModel = os.path.join('LanguageModel', 'Configurations', 'Configurations_sentiment.xml')
        stopWordsFileName = os.path.join('LanguageModel', 'Input', 'stop_words.txt')
        linksDBFile = os.path.join('LanguageModel', 'Output', 'links_database.txt')
        languageModelSerializationFile = os.path.join('LanguageModel', 'Output', 'language_model.bin')
        self.languageModel = LanguageModel(configFileLanguageModel, stopWordsFileName,
                                           languageModelSerializationFile, linksDBFile, [])
        self.configFileFeaturesExtractor = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations_sentiment.xml')
        self.trainFeaturesSerializationFile = os.path.join('FeaturesExtractor', 'Output', 'train_features.bin')
        self.trainLabelsSerializationFile = os.path.join('FeaturesExtractor', 'Output', 'train_labels.bin')

    def get_data(self, backend=True):
        configFileDatasetBuilder = os.path.join('DatasetBuilder', 'Configurations', 'Configurations.xml')
        datasetSerializationFile = os.path.join('DatasetBuilder', 'Output', 'dataset.bin')
        self.datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)
        dataset = None
        #if not backend:
        xlsxTrainFileName = os.path.join('DatasetBuilder', 'Input', 'sentiment')
        dataset = self.datasetBuilder.GetSentimentDatasetFromXLSXFile(xlsxTrainFileName)
        if backend:
            dataset2 = self.datasetBuilder.GetSentimentDatasetFromBackend()
            for item in dataset2:
                dataset[item] = dataset2[item]
        dataset = list(dataset.values())
        if len(dataset) < MIN_DATA:
            return
        print("Data length: ", len(dataset))
        self.languageModel.dataset = dataset
        self.languageModel.totalNumberOfDocs = len(dataset)
        self.languageModel.BuildLanguageModel()
        self.languageModel.dataset = []
        return dataset

    def prepare_data(self, dataset):
        trainFeaturesExtractor = FeaturesExtractor(self.configFileFeaturesExtractor,
                                                   self.trainFeaturesSerializationFile,
                                                   self.trainLabelsSerializationFile,
                                                   self.languageModel, dataset,
                                                   sentiment_features=True)
        trainFeaturesExtractor.ExtractNumTfFeatures(sentiment_dict=init_dicts(), sparse=True)
        X = trainFeaturesExtractor.sparse_features
        Y = np.array(trainFeaturesExtractor.labels)
        trainFeaturesExtractor.dataset = []
        trainFeaturesExtractor.features = []
        trainFeaturesExtractor.labels = []
        return X, Y

    def transform_data(self, X, Y):
        X = np.array(X)
        Y = np.array(Y)
        ri = range(X.shape[0])
        rl = range(X.shape[1])
        d = pd.DataFrame(X, index=ri, columns=rl)
        d['class'] = Y
        return d

    def split_data(self, X, Y):
        training_indices, testing_indices = train_test_split(range(X.shape[0]), stratify=Y,
                                                              train_size=0.75, test_size=0.25)
        self.ntraining_samples = len(training_indices)
        return training_indices, testing_indices

    def train(self, backend=True):
        rawdata = self.get_data(backend)
        Xall, Yall = self.prepare_data(rawdata)
        self.training_indices, self.testing_indices = self.split_data(Xall, Yall)
        X = Xall[self.training_indices]
        Y = Yall[self.training_indices]
        Xtest = Xall[self.testing_indices]
        Ytest = Yall[self.testing_indices]
        acc = 0.0
        if self.modeln == 1:
            print(self.fit_model1(X, Y))
            acc = self.evaluate_model1(Xtest, Ytest)
        if self.modeln == 2:
            print(self.fit_model2(X, Y))
            acc = self.evaluate_model2(Xtest, Ytest)
        if self.modeln == 3:
            print(self.fit_model3(X, Y))
            acc = self.evaluate_model3(Xtest, Ytest)
        if self.modeln == 4:
            print(self.fit_model4(X, Y))
            acc = self.evaluate_model4(Xtest, Ytest)
        if self.modeln == 5:
            print(self.fit_model5(X, Y))
            acc = self.evaluate_model5(Xtest, Ytest)
        result = {'accuracy': acc, 'training_samples': self.ntraining_samples}
        return result

    def fit_model1(self, X, Y):
        self.model1 = LinearSVC(C=0.018, dual=False, random_state=42)
        self.model1.fit(X, Y)
        recall = self.model1.score(X, Y)
        return recall

    def evaluate_model1(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def fit_model2(self, X, Y):
        self.model1 = LinearSVC(C=0.18, penalty="l1", dual=False, random_state=42)
        self.model1.fit(X, Y)
        recall = self.model1.score(X, Y)
        return recall

    def evaluate_model2(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def fit_model3(self, X, Y):
        pre_recall = 0.0
        for g in [0.01, 0.05, 0.1, 0.3, 0.5]:
            model = SVC(C=0.18, gamma=g, random_state=42)
            model.fit(X, Y)
            recall = model.score(X, Y)
            print(recall)
            if recall > pre_recall:
                pre_recall = recall
                self.model1 = model
        return recall

    def evaluate_model3(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def fit_model4(self, X, Y):
        model = SVC(C=0.18, gamma=0.1, random_state=42)
        model.fit(X, Y)
        recall = model.score(X, Y)
        self.model1 = model
        return recall

    def evaluate_model4(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def fit_model5(self, X, Y):
        model = LogisticRegression(C=0.18, random_state=42)
        model.fit(X, Y)
        recall = model.score(X, Y)
        self.model1 = model
        return recall

    def evaluate_model5(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def classify(self, tweets):
        dataset = []
        for tw in tweets:
            dataset.append({'text': tw, 'label': 'neutral'})
        X, Y = self.prepare_data(dataset)
        return self.model1.predict(X)

    @classmethod
    def load(cls, path):
        return pickle.load(open(path, 'rb'))

    def save(self, path):
        return pickle.dump(self, open(path, 'wb'))
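# Hedged usage sketch (not part of the source): train the sparse-feature SentimentModel
# above, persist it with save(), then classify a couple of tweets. The file name
# 'sentiment_model.bin' and the example tweets are assumptions for illustration only;
# backend=False keeps the sketch on the XLSX data path.
if __name__ == '__main__':
    sm = SentimentModel(modeln=1)
    result = sm.train(backend=False)   # {'accuracy': ..., 'training_samples': ...}
    print(result)
    sm.save('sentiment_model.bin')
    sm = SentimentModel.load('sentiment_model.bin')
    print(sm.classify(['great earnings today', 'weak results this quarter']))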
class QuestionsModel(object):
    def __init__(self, words_dict_path=None, dataset_path=None, modeln=1):
        if not words_dict_path:
            words_dict_path = os.path.join('data', 'questions_dict.bin')
        if not dataset_path:
            dataset_path = os.path.join('data', 'questions_dataset.bin')
        self.modeln = modeln
        self.words_dict_path = words_dict_path
        self.dataset_path = dataset_path
        configFileLanguageModel = os.path.join('LanguageModel', 'Configurations', 'Configurations_questions.xml')
        stopWordsFileName = os.path.join('LanguageModel', 'Input', 'stop_words.txt')
        linksDBFile = os.path.join('LanguageModel', 'Output', 'links_database.txt')
        languageModelSerializationFile = os.path.join('LanguageModel', 'Output', 'language_model.bin')
        self.languageModel = LanguageModel(configFileLanguageModel, stopWordsFileName,
                                           languageModelSerializationFile, linksDBFile, [])
        self.configFileFeaturesExtractor = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations_questions.xml')
        self.trainFeaturesSerializationFile = os.path.join('FeaturesExtractor', 'Output', 'train_features.bin')
        self.trainLabelsSerializationFile = os.path.join('FeaturesExtractor', 'Output', 'train_labels.bin')

    def get_data(self):
        configFileDatasetBuilder = os.path.join('DatasetBuilder', 'Configurations', 'Configurations.xml')
        datasetSerializationFile = os.path.join('DatasetBuilder', 'Output', 'dataset.bin')
        self.datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)
        dataset = self.datasetBuilder.getQuestionsDataset(self.dataset_path)
        self.languageModel.dataset = dataset
        self.languageModel.totalNumberOfDocs = len(dataset)
        self.languageModel.BuildLanguageModel()
        self.languageModel.dataset = []
        return dataset

    def prepare_data(self, dataset):
        trainFeaturesExtractor = FeaturesExtractor(self.configFileFeaturesExtractor,
                                                   self.trainFeaturesSerializationFile,
                                                   self.trainLabelsSerializationFile,
                                                   self.languageModel, dataset,
                                                   questions_features=True)
        print("Data length: ", len(dataset))
        words_dict = self.datasetBuilder.getQuestionsDatasetDictionary(self.words_dict_path)
        trainFeaturesExtractor.ExtractNumTfFeatures(questions_dict=words_dict)
        maxid = max([max(i.keys()) for i in trainFeaturesExtractor.features])
        X = []
        Y = []
        L = len(dataset)
        for i, item in enumerate(trainFeaturesExtractor.features):
            itemx = [0 for _ in range(maxid)]
            l = [0, 0, 0]
            l[trainFeaturesExtractor.labels[i] - 1] = 1
            for j in trainFeaturesExtractor.features[i]:
                v = trainFeaturesExtractor.features[i][j]
                itemx[j - 1] = v
            X.append(itemx)
            Y.append(trainFeaturesExtractor.labels[i])
        return X, Y, L

    def transform_data(self, X, Y):
        X = np.array(X)
        Y = np.array(Y)
        ri = range(X.shape[0])
        rl = range(X.shape[1])
        d = pd.DataFrame(X, index=ri, columns=rl)
        d['class'] = Y
        return d

    def split_data(self, d):
        training_indices, testing_indices = train_test_split(d.index, stratify=d['class'].values,
                                                              train_size=0.75, test_size=0.25)
        return training_indices, testing_indices

    def train(self):
        rawdata = self.get_data()
        X, Y, L = self.prepare_data(rawdata)
        ret = [0, 0]
        ret[0] = L
        d = self.transform_data(X, Y)
        self.training_indices, self.testing_indices = self.split_data(d)
        X = d.loc[self.training_indices].drop('class', axis=1).values
        Y = d.loc[self.training_indices, 'class'].values
        Xtest = d.loc[self.testing_indices].drop('class', axis=1).values
        Ytest = d.loc[self.testing_indices, 'class'].values
        if self.modeln == 1:
            print(self.fit_model1(X, Y))
            ret[1] = self.evaluate_model1(Xtest, Ytest)
            print(ret[1])
        if self.modeln == 2:
            print(self.fit_model2(X, Y))
            ret[1] = self.evaluate_model2(Xtest, Ytest)
            print(ret[1])
        if self.modeln == 3:
            print(self.fit_model3(X, Y))
            ret[1] = self.evaluate_model3(Xtest, Ytest)
            print(ret[1])
        return ret

    @classmethod
    def load(cls, path):
        return pickle.load(open(path, 'rb'))

    def save(self, path):
        return pickle.dump(self, open(path, 'wb'))

    def fit_model1(self, X, Y):
        self.model1 = LinearSVC(C=0.01, penalty="l1", dual=False, random_state=42)
        self.model1.fit(X, Y)
        recall = self.model1.score(X, Y)
        return recall

    def evaluate_model1(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def fit_model2(self, X, Y):
        self.model1 = LinearSVC(C=0.18, penalty="l1", dual=False, random_state=42)
        self.model1.fit(X, Y)
        recall = self.model1.score(X, Y)
        return recall

    def evaluate_model2(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def fit_model3(self, X, Y):
        pre_recall = 0.0
        for g in [0.01, 0.05, 0.1, 0.3, 0.5]:
            model = SVC(C=0.18, gamma=g, random_state=42)
            model.fit(X, Y)
            recall = model.score(X, Y)
            print(recall)
            if recall > pre_recall:
                pre_recall = recall
                self.model1 = model
        return recall

    def evaluate_model3(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def isQuestion(self, opinion):
        dataset = [{'text': opinion.text, 'label': 'negativeq'}]
        X, Y, L = self.prepare_data(dataset)
        return self.model1.predict(X)[0]

    def addQuestion(self, opinion):
        q = models.QuestionOpinion()
        q.tweet = opinion
        q.since_id = opinion.twitter_id
        q.save()
        return q

    def checkQuestion(self, twitter, q):
        s = twitter.search(q='@' + q.tweet.tweeter.tweeter_name, count='500',
                           result_type='mixed', since_id=q.since_id)
        replies = []
        for tw in s['statuses']:
            if tw['in_reply_to_status_id_str'] == str(q.tweet.twitter_id):
                replies.append(tw)
        diffdate = datetime.now() - q.date_created.replace(tzinfo=None)
        if diffdate.days > MAX_DAYS:
            q.delete()
        else:
            q.since_id = s['statuses'][0]['in_reply_to_status_id_str']
            q.save()
        return {"replies": replies, 'found': s['statuses']}

    def checkQuestions(self, twitter):
        for q in models.QuestionOpinion.objects.filter():
            self.checkQuestion(twitter, q)
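# Hedged usage sketch (not part of the source): train the question detector above and
# report the dataset size and held-out accuracy it returns. Paths fall back to the
# defaults under 'data/' set in __init__; 'questions_model.bin' is an assumed file name.
if __name__ == '__main__':
    qm = QuestionsModel(modeln=1)
    dataset_size, accuracy = qm.train()   # train() returns [dataset length, validation accuracy]
    print('Dataset size %d, validation accuracy %.3f' % (dataset_size, accuracy))
    qm.save('questions_model.bin')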