def validate(self, df, epoch, counter, losses, train_accuracy, print_every):
    LoggerHelper.info("Validation Started...")
    # Get validation loss
    val_losses = []
    self.model.eval()
    accuracy = 0
    result = np.asarray([])
    result_expected = np.asarray([])
    for x, y in self.reader.get_data(NewsDnnBaseDataReader.DictDataTerm["Validate"],
                                     NewsDnnBaseDataReader.DictDataType[
                                         self.config["options"]["network_type"]]):
        x, y = torch.from_numpy(x), torch.from_numpy(y)
        inputs, targets = x, y
        if self.model.can_use_gpu and self.config["networkConfig"]["useGPU"]:
            inputs, targets = inputs.cuda(), targets.cuda()
        output = self.model(inputs)
        val_loss = self.criterion(output, targets.long())
        val_losses.append(val_loss.item())
        accuracy += self.calculate_accuracy(output, targets)  # Accumulated per batch; averaged when logged
        result = np.append(result, self.get_output(output))
        result_expected = np.append(result_expected, targets.cpu().numpy())  # Move to CPU before converting to numpy
    self.model.train()  # Reset to train mode after iterating through validation data
    scores = self.calculate_scores(result_expected, result)
    return self.log_validate(df, epoch, counter, losses, val_losses, self.validate_count, scores)
def test(self):
    LoggerHelper.info("Test Started...")
    self.timer.start()
    df = pandas.DataFrame(columns=['Accuracy', 'Test Accuracy', 'Mean Test Loss'])
    val_losses = []
    self.model.eval()
    counter = 0
    accuracy = 0
    result = np.asarray([])
    result_expected = np.asarray([])
    for x, y in self.reader.get_data(NewsDnnBaseDataReader.DictDataTerm["Test"],
                                     NewsDnnBaseDataReader.DictDataType[
                                         self.config["options"]["network_type"]]):
        counter += 1
        x, y = torch.from_numpy(x), torch.from_numpy(y)
        inputs, targets = x, y
        if self.model.can_use_gpu and self.config["networkConfig"]["useGPU"]:
            inputs, targets = inputs.cuda(), targets.cuda()
        output = self.model(inputs)
        val_loss = self.criterion(output, targets.long())
        val_losses.append(val_loss.item())
        accuracy += self.calculate_accuracy(output, targets)
        result = np.append(result, self.get_output(output))
        result_expected = np.append(result_expected, targets.cpu().numpy())  # Move to CPU before converting to numpy
    scores = self.calculate_scores(result_expected, result)
    df = self.log_test(df, accuracy, self.test_count, val_losses, scores)
    Export.append_df_to_excel(df, self.current_date)
    self.timer.stop(time_for="Test")
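# The loops above delegate metric computation to self.calculate_scores, which is
# not shown in this section. A minimal sketch of what such a helper could look
# like, assuming scikit-learn and flat numpy arrays of class labels; the averaging
# mode and exact metric choices are assumptions, not the repo's confirmed code:
from sklearn import metrics

def calculate_scores_sketch(expected, predicted):
    return {
        "accuracy": metrics.accuracy_score(expected, predicted),
        "f1": metrics.f1_score(expected, predicted, average="macro"),
        "precision": metrics.precision_score(expected, predicted, average="macro"),
        "recall": metrics.recall_score(expected, predicted, average="macro"),
        "hamming": metrics.hamming_loss(expected, predicted),
        "jaccard": metrics.jaccard_score(expected, predicted, average="macro"),
    }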
def log_validate_without_loss(self, df, epoch, counter, validate_count, scores):
    LoggerHelper.info(
        "Epoch: {}/{}...".format(epoch + 1, self.epochs) +
        "Step: {}...".format(counter) +
        "Accuracy In Step: {:.4f}...".format(scores["accuracy"]) +
        "F1: {:.4f}...".format(scores["f1"]) +
        "Precision: {:.4f}...".format(scores["precision"]) +
        "Recall: {:.4f}...".format(scores["recall"]) +
        "Hamming: {:.4f}...".format(scores["hamming"]) +
        "Jaccard: {:.4f}...".format(scores["jaccard"]) +
        "Val Count: {}".format(validate_count))
    df = df.append(
        {
            'Epoch': "{}/{}".format(epoch + 1, self.epochs),
            'Step': counter,
            'Accuracy In Step': scores["accuracy"],
            'F1 In Step': scores["f1"],
            'Precision In Step': scores["precision"],
            'Recall In Step': scores["recall"],
            'Hamming In Step': scores["hamming"],
            'Jaccard In Step': scores["jaccard"]
        }, ignore_index=True)
    return df
def load_model(self, path):
    self.model = BertForSequenceClassification.from_pretrained(path)  # Re-load fine-tuned weights
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # Re-load matching tokenizer
    LoggerHelper.info("**Model Info**" +
                      "\nbatch_size : " + str(self.reader.batch_size) +
                      "\nsequence_length : " + str(self.reader.sequence_length))
    LoggerHelper.info("Model loaded from disk")
def log_validate(self, df, epoch, counter, loss, val_losses, validate_count, scores):
    LoggerHelper.info(
        "Epoch: {}/{}...".format(epoch + 1, self.epochs) +
        "Step: {}...".format(counter) +
        "Loss: {:.4f}...".format(loss.item()) +
        "Accuracy In Step: {:.4f}...".format(scores["accuracy"]) +
        "F1: {:.4f}...".format(scores["f1"]) +
        "Precision: {:.4f}...".format(scores["precision"]) +
        "Recall: {:.4f}...".format(scores["recall"]) +
        "Hamming: {:.4f}...".format(scores["hamming"]) +
        "Jaccard: {:.4f}...".format(scores["jaccard"]) +
        "Val Count: {}...".format(validate_count) +
        "Val Loss: {:.4f}".format(np.mean(val_losses)))
    df = df.append(
        {
            'Epoch': "{}/{}".format(epoch + 1, self.epochs),
            'Step': counter,
            'Last Train Loss': loss.item(),
            'Mean Test Loss': np.mean(val_losses),
            'Accuracy In Step': scores["accuracy"],
            'F1 In Step': scores["f1"],
            'Precision In Step': scores["precision"],
            'Recall In Step': scores["recall"],
            'Hamming In Step': scores["hamming"],
            'Jaccard In Step': scores["jaccard"]
        }, ignore_index=True)
    return df
def evaluate(self):
    LoggerHelper.info("Evaluation Started...")
    nlp = pipeline('sentiment-analysis')
    self.load_model(self.config["evaluation"]["load"])
    self.model.eval()
    self.timer.start()
    db = Mongo()
    news_collection = db.create_collection(self.config["evaluation"]["collection"])
    news_filtered = db.create_collection(self.config["evaluation"]["destination"],
                                         NewsOrganizer.get_index_models())
    count = 0
    processed = 0
    while True:
        try:
            cursor = news_collection.find(self.config["evaluation"]["query"],
                                          no_cursor_timeout=True).skip(processed)
            for news in cursor:
                try:
                    summery = news.get('summery')
                    b_input_ids, b_input_mask = self.reader.get_one_news(summery)
                    b_input_ids, b_input_mask = b_input_ids.to(self.device), b_input_mask.to(self.device)
                    outputs = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
                    logits = outputs[0].detach().cpu().numpy()  # Move result to CPU
                    result = np.argmax(logits, axis=1).flatten()
                    sentiment = nlp(summery)  # Sentiment of the summary, stored with the filtered news
                    if result[0] == 1:
                        news_filtered.insert({
                            "_id": news.get('_id'),
                            "title": news.get('title'),
                            "summery": news.get('summery'),
                            "article": news.get('article'),
                            "url": news.get('url'),
                            "category": news.get('category'),
                            "price_after_minute": news.get('price_after_minute'),
                            "price_after_hour": news.get('price_after_hour'),
                            "price_after_day": news.get('price_after_day'),
                            "sentiment": sentiment,
                            "price_before": news.get('price_before'),
                            "wiki_relatedness": news.get('wiki_relatedness'),
                            "tweet_count": news.get('tweet_count'),
                            "tweet_percentage": news.get('tweet_percentage'),
                            "date": news.get('date'),
                            "authors": news.get('authors'),
                            "comment": news.get('comment'),
                            "price_effect": news.get('price_effect')
                        })
                except Exception as exception:
                    Logger().get_logger().error(type(exception).__name__, exc_info=True)
                    traceback.print_exc()
                count = count + 1
                if count % 500 == 0:
                    print(count)
                processed += 1
            cursor.close()
            break
        except CursorNotFound:
            processed += 1
            print("Lost cursor. Retry with skip")
    self.timer.stop(time_for="Evaluation")
def load_config():
    LoggerHelper.info("Loading Config...")
    pwd = os.path.dirname(os.path.abspath(__file__))
    if platform.system() == "Windows":
        Config.add_config_ini('%s\\initialization\\main_w.ini' % pwd)
    else:
        Config.add_config_ini('%s/initialization/main.ini' % pwd)
    LoggerHelper.info("Config is loaded.")
def get_network_input_size(self):
    size = self.config["wordEmbedding"]["size"]
    if self.config["options"]["wiki"]["enabled"]:
        size = size + self.config["options"]["wiki"]["multiply_factors"]
    if self.config["options"]["twitter"]["enabled"]:
        size = size + self.config["options"]["twitter"]["multiply_factors"]
    LoggerHelper.info("Network Input Size :" + str(size))
    return size
def calculate_hidden_size(self):
    # Rule-of-thumb heuristic: hidden size = samples / (scaling factor * (inputs + outputs))
    samples_in_training_data = 116100
    scaling_factor = 5
    input_neurons = self.input_size
    output_neurons = self.output_size
    size = int(samples_in_training_data / (scaling_factor * (input_neurons + output_neurons)))
    LoggerHelper.info('Calculated hidden size is ' + str(size))
    if size == 0:
        LoggerHelper.error('Calculated hidden size of 0 is changed to 2')
        return 2
    return size
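# A quick sanity check of the heuristic above with hypothetical sizes matching the
# LSTM model's defaults (input_size=1, output_size=3); standalone arithmetic, not
# part of the class:
samples = 116100
scaling = 5
inputs, outputs = 1, 3
hidden = int(samples / (scaling * (inputs + outputs)))
print(hidden)  # 116100 / (5 * 4) = 5805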
def save_model(self):
    # Serialize model and optimizer state into a single torch checkpoint file
    save_file_name = self.get_save_file_name()
    checkpoint = {
        'model': NewsDnnGeneralModel(),
        'model_state_dict': self.model.state_dict(),
        'optimizer': optim.Adam(self.model.parameters(), lr=self.model.lr),
        'optimizer_state_dict': self.optimizer.state_dict()
    }
    torch.save(checkpoint, save_file_name)
    LoggerHelper.info("Model Saved to disk")
def load_model(self, path):
    checkpoint = torch.load(path)
    self.model = checkpoint['model']
    self.model.load_state_dict(checkpoint['model_state_dict'])
    self.optimizer = checkpoint['optimizer']
    self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    LoggerHelper.info("**Model Info**" +
                      "\nbatch_size : " + str(self.reader.batch_size) +
                      "\nsequence_length : " + str(self.reader.sequence_length) +
                      "\ninput_size : " + str(self.model.input_size) +
                      "\nnum_layers : " + str(self.model.num_layers) +
                      "\ndrop_prob : " + str(self.model.drop_prob) +
                      "\nlr : " + str(self.model.lr))
    LoggerHelper.info("Model loaded from disk")
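# A minimal round-trip sketch of the checkpoint pattern used by save_model and
# load_model above, with a plain nn.Linear standing in for the project's model
# class (hypothetical file name; newer PyTorch versions may additionally need
# weights_only=False in torch.load to unpickle whole modules):
import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(10, 3)
optimizer = optim.Adam(model.parameters(), lr=0.001)
torch.save({'model': model,
            'model_state_dict': model.state_dict(),
            'optimizer': optimizer,
            'optimizer_state_dict': optimizer.state_dict()},
           'checkpoint.pth')

checkpoint = torch.load('checkpoint.pth')
model = checkpoint['model']
model.load_state_dict(checkpoint['model_state_dict'])
optimizer = checkpoint['optimizer']
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])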
def test(self):
    LoggerHelper.info("Test Started...")
    self.timer.start()
    df = pandas.DataFrame(columns=['Accuracy', 'Test Accuracy', 'Mean Test Loss'])
    # Tracking variables
    val_losses = []
    predictions, true_labels = [], []
    test_set = self.reader.get_data(NewsCateDataReader.DictDataTerm["Test"],
                                    NewsCateDataReader.DictDataType[
                                        self.config["options"]["network_type"]])
    self.model.eval()
    accuracy = 0
    for batch in test_set:
        # Add batch to GPU
        batch = tuple(t.to(self.device) for t in batch)
        # Unpack the inputs from the dataloader
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs[0]
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate the accuracy for this batch of test sentences
        label, acc = self.calculate_accuracy(logits, label_ids)
        accuracy += acc
        # Store predictions and true labels
        predictions.append(label)
        true_labels.append(label_ids)
    scores = self.calculate_scores(predictions, true_labels)
    df = self.log_test(df, accuracy, self.test_count, val_losses, scores)
    Export.append_df_to_excel(df, self.current_date)
    self.timer.stop(time_for="Test")
def validate(self, df, epoch, losses):
    LoggerHelper.info("Validation Started...")
    # Get validation loss
    val_losses = []
    predictions, true_labels = [], []
    self.model.eval()
    accuracy = 0
    steps = 0
    validate_set = self.reader.get_data(NewsCateDataReader.DictDataTerm["Validate"],
                                        NewsCateDataReader.DictDataType[
                                            self.config["options"]["network_type"]])
    for batch in validate_set:  # Evaluate data for one epoch
        # Add batch to GPU
        batch = tuple(t.to(self.device) for t in batch)
        # Unpack the inputs from the dataloader
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():  # Do not compute or store gradients
            outputs = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs[0]
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate the accuracy for this batch of validation sentences
        label, tmp_eval_accuracy = self.calculate_accuracy(logits, label_ids)
        # Accumulate the total accuracy
        accuracy += tmp_eval_accuracy
        # Track the number of batches
        steps += 1
        # Store predictions and true labels
        predictions.append(label)
        true_labels.append(label_ids)
    # Report the final accuracy for this validation run
    LoggerHelper.info("Accuracy: {0:.2f}".format(accuracy / steps))
    scores = self.calculate_scores(predictions, true_labels)
    self.model.train()  # Reset to train mode after iterating through validation data
    return self.log_validate_without_loss(df, epoch, 0, self.validate_count, scores)
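# Both BERT loops above delegate to self.calculate_accuracy, which is not shown in
# this section. A minimal sketch of the usual flat-accuracy helper, assuming logits
# of shape (batch, num_classes) and integer label ids (the repo's version may
# differ in what it returns as `label`):
import numpy as np

def calculate_accuracy_sketch(logits, label_ids):
    predicted = np.argmax(logits, axis=1).flatten()
    labels = label_ids.flatten()
    return predicted, np.sum(predicted == labels) / len(labels)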
def __init__(self, input_size=1, hidden=None, n_layers=2, drop_prob=0.2, lr=0.001,
             training_data_size=100000, output_size=3, use_gpu=True):
    super().__init__()
    self.should_use_gpu = use_gpu
    self.input_size = input_size
    self.training_data_size = training_data_size
    self.num_layers = n_layers
    self.drop_prob = drop_prob
    self.lr = lr
    self.output_size = output_size
    if hidden is None:
        self.hidden = self.calculate_hidden_size()
    else:
        self.hidden = hidden
    self.lstm = nn.LSTM(self.input_size,    # Expected features in the input
                        self.hidden,        # Features in the hidden state
                        self.num_layers,    # Stacked LSTMs
                        bias=True,          # Whether bias weights should be used
                        dropout=drop_prob,  # Dropout on the outputs of each LSTM layer except the last
                        batch_first=True,   # Input and output tensors are provided as (batch, seq, feature)
                        bidirectional=False)
    # Additional Dropout Layer
    self.dropout = nn.Dropout(drop_prob)
    # Fully-Connected Output Layer
    self.fc = nn.Linear(self.hidden, output_size)
    # LogSoftmax Output Layer (log-probabilities over the classes)
    self.sig = nn.LogSoftmax(dim=1)
    # Check GPU Usage
    self.can_use_gpu = torch.cuda.is_available()
    if self.can_use_gpu:
        if self.should_use_gpu:
            LoggerHelper.info('Training on GPU!')
        else:
            LoggerHelper.info('GPU usage is disabled by config.json')
    else:
        LoggerHelper.info('No GPU available, training on CPU; consider making n_epochs very small.')
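# A minimal shape-flow sketch of the LSTM stack defined above, using standalone
# layers with small hypothetical sizes (input_size=1, hidden=16, 2 layers, 3 output
# classes) and assuming the forward pass feeds the last time step into the
# fully-connected layer, which is typical for this setup but not shown here:
import torch
import torch.nn as nn

lstm = nn.LSTM(1, 16, 2, bias=True, dropout=0.2, batch_first=True)
dropout = nn.Dropout(0.2)
fc = nn.Linear(16, 3)
log_softmax = nn.LogSoftmax(dim=1)

x = torch.randn(4, 10, 1)            # (batch, seq, feature)
out, (h, c) = lstm(x)                # out: (4, 10, 16)
logits = fc(dropout(out[:, -1, :]))  # last time step -> (4, 3)
print(log_softmax(logits).shape)     # torch.Size([4, 3])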
def __init__(self, input_size=102, n_layers=2, drop_prob=0.2, n_filters=100,
             filter_sizes=(3, 4, 5), lr=0.001, output_size=3, use_gpu=True):
    super().__init__()
    self.should_use_gpu = use_gpu
    self.input_size = input_size
    self.output_size = output_size
    self.num_layers = n_layers
    self.drop_prob = drop_prob
    self.lr = lr
    # 1D Convolution Layers, one per filter size
    self.conv_0 = nn.Conv1d(in_channels=input_size, out_channels=n_filters, kernel_size=filter_sizes[0])
    self.conv_1 = nn.Conv1d(in_channels=input_size, out_channels=n_filters, kernel_size=filter_sizes[1])
    self.conv_2 = nn.Conv1d(in_channels=input_size, out_channels=n_filters, kernel_size=filter_sizes[2])
    # Additional Dropout Layer
    self.dropout = nn.Dropout(drop_prob)
    # Fully-Connected Output Layer (concatenated pooled features from all filter sizes)
    self.fc = nn.Linear(len(filter_sizes) * n_filters, output_size)
    # Softmax Output Layer (probabilities over the classes)
    self.sig = nn.Softmax(dim=1)
    # Check GPU Usage
    self.can_use_gpu = torch.cuda.is_available()
    if self.can_use_gpu:
        if self.should_use_gpu:
            LoggerHelper.info('Training on GPU!')
        else:
            LoggerHelper.info('GPU usage is disabled by config.json')
    else:
        LoggerHelper.info('No GPU available, training on CPU; consider making n_epochs very small.')
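# A minimal shape-flow sketch of the three-branch Conv1d stack above, with small
# hypothetical sizes (8 input channels, 4 filters per branch, sequence length 20),
# assuming the usual max-over-time pooling and concatenation before the linear
# layer (the class's forward pass is not shown in this section):
import torch
import torch.nn as nn
import torch.nn.functional as F

convs = [nn.Conv1d(in_channels=8, out_channels=4, kernel_size=k) for k in (3, 4, 5)]
fc = nn.Linear(3 * 4, 3)

x = torch.randn(2, 8, 20)                # (batch, channels, seq)
pooled = []
for conv in convs:
    conv_out = F.relu(conv(x))           # (2, 4, 20 - k + 1)
    pooled.append(F.max_pool1d(conv_out, conv_out.shape[2]).squeeze(2))  # (2, 4)
features = torch.cat(pooled, dim=1)      # (2, 3 * 4)
print(fc(features).shape)                # torch.Size([2, 3])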
def stop(self, time_for=None):
    end_dt = dt.datetime.now()
    if time_for is None:
        LoggerHelper.info('Time taken: %s' % (end_dt - self.start_dt))
    else:
        LoggerHelper.info('Time taken for ' + time_for + ' : %s' % (end_dt - self.start_dt))
def get_network_input_size(self):
    size = 1
    LoggerHelper.info("Network Input Size :" + str(size))
    return size
def main():
    # Load Config
    load_config()
    # Load args
    args = load_arg()
    if args.fdc:
        LoggerHelper.info("Starting Financial Data Collector Mode...")
        fdc = FDC()
        fdc.collect()
        LoggerHelper.info("Financial Data Collector Mode is ended.")
    if args.wiki:
        LoggerHelper.info("Starting Wikipedia Load Mode...")
        wiki = WikiRecorder()
        wiki.collect_all()
        LoggerHelper.info("Wikipedia Load Mode is ended.")
    if args.organize:
        LoggerHelper.info("Starting News Organizer Mode...")
        collector = NewsOrganizer()
        collector.dnn_organizer_for_dnn_filtered_news()
        LoggerHelper.info("News Organizer Mode is ended.")
    if args.ind:
        LoggerHelper.info("Starting Indicators Collector Mode...")
        ind_collector = IndicatorsCollector()
        if args.ind == "zip":
            ind_collector.collect_from_zip()
        else:
            ind_collector.collect()
        LoggerHelper.info("Indicators Collector Mode is ended.")
    if args.news is not None:
        LoggerHelper.info("Starting Stock Prediction Mode...")
        news_dnn = get_news_type(args.news)
        news_dnn.train(print_every=int(Config.training.print_every))
        news_dnn.test()
        LoggerHelper.info("News Stock Prediction is ended.")
        # WordEmbedding(path=Config.word_embedding.path)
        # news_dnn = NewsDnnMain(epochs=int(Config.training.epochs),
        #                        batch_size=int(Config.training.batch_size),
        #                        seq_length=int(Config.training.sequence_length),
        #                        lr=float(Config.training.lr))
    if args.statistics:
        LoggerHelper.info("Starting Statistic Collection Mode...")
        Statistics().collect()
        LoggerHelper.info("Statistic Collection is ended...")
    if args.test:
        LoggerHelper.info("Starting Test Mode...")
        TransformersTest.sentiment_analysis_test()
        LoggerHelper.info("Test Mode is ended...")
    if args.webservice:
        web_manager = WebManager()
        web_manager.add_static_files()
        web_manager.add_news_root()
        web_manager.run()
def train(self, print_every=20):
    df = pandas.DataFrame(columns=['Epoch', 'Step', 'Train Mean Loss Cumulative',
                                   'Train Accuracy', 'Val Mean Loss', 'Val Accuracy'])
    self.timer.start()
    self.model.train()  # Set mode of model
    losses = []
    train_set = self.reader.get_data(fetch_type=NewsCateDataReader.DictDataTerm["Train"],
                                     data_type=NewsCateDataReader.DictDataType[
                                         self.config["options"]["network_type"]])
    for e in range(self.epochs):
        print(self.config["options"]["network_type"])
        print(NewsCateDataReader.DictDataType[self.config["options"]["network_type"]])
        self.model.train()  # Set to Train Mode
        total_loss_for_epoch = 0
        epoch_timer = Timer()
        epoch_timer.start()
        for step, batch in enumerate(train_set):  # For each batch of training data...
            # Report progress every print_every batches
            if step % print_every == 0:
                print('  Batch {:>5,} of {:>5,}.'.format(step, len(train_set)))
            # Get Data
            b_input_ids = batch[0].to(self.device)
            b_input_mask = batch[1].to(self.device)
            b_labels = batch[2].to(self.device)
            # Clear previously accumulated gradients
            # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
            self.model.zero_grad()
            # Perform a forward pass (evaluate the model on this training batch).
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = self.model(b_input_ids,
                                 token_type_ids=None,
                                 attention_mask=b_input_mask,
                                 labels=b_labels)
            loss = outputs[0]
            total_loss_for_epoch += loss.item()
            # Perform a backward pass to calculate the gradients
            loss.backward()
            # Clip gradients to help prevent the "exploding gradients" problem
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            # Update parameters based on their gradients, the learning rate, etc.
            self.optimizer.step()
            # Update the learning rate
            self.scheduler.step()
        # Calculate the average loss over the training data
        avg_train_loss = total_loss_for_epoch / len(train_set)
        # Store the loss value for plotting the learning curve
        losses.append(avg_train_loss)
        LoggerHelper.info("  Average training loss: {0:.2f}".format(avg_train_loss))
        epoch_timer.stop(time_for="Epoch")
        timer = Timer(start=True)
        df = self.validate(df, e, losses)
        timer.stop(time_for="Validate")
        self.model.train()
    self.timer.stop(time_for="Train")
    self.save_model()
    self.current_date = DateHelper.get_current_date()
    Export.append_df_to_excel(df, self.current_date)
    Export.append_df_to_excel(self.get_info(), self.current_date)
def save_model(self):
    save_file_name = self.get_save_file_name()
    FileHelper.create_path_if_not_exists(save_file_name)
    self.model.save_pretrained(save_file_name)             # Save fine-tuned model weights and config
    self.reader.tokenizer.save_pretrained(save_file_name)  # Save tokenizer alongside the model
    LoggerHelper.info("Model Saved to disk")