Example No. 1
    def validate(self, df, epoch, counter, losses, train_accuracy, print_every):
        LoggerHelper.info("Validation Started...")
        # Get validation loss
        val_losses = []
        self.model.eval()
        accuracy = 0
        result = np.asarray([])
        result_expected = np.asarray([])
        for x, y in self.reader.get_data(NewsDnnBaseDataReader.DictDataTerm["Validate"],
                                         NewsDnnBaseDataReader.DictDataType[
                                             self.config["options"]["network_type"]]):
            # reader yields (x, y) batches as NumPy arrays
            x, y = torch.from_numpy(x), torch.from_numpy(y)

            inputs, targets = x, y
            if self.model.can_use_gpu and self.config["networkConfig"]["useGPU"]:
                inputs, targets = inputs.cuda(), targets.cuda()

            output = self.model(inputs)
            val_loss = self.criterion(output, targets.long())

            val_losses.append(val_loss.item())
            accuracy += self.calculate_accuracy(output, targets)
            # Accumulate predictions and expected labels for score calculation
            result = np.append(result, self.get_output(output))
            result_expected = np.append(result_expected, targets.cpu().numpy())  # move to CPU before converting to NumPy
        self.model.train()  # reset to train mode after iterating through validation data
        scores = self.calculate_scores(result_expected, result)
        return self.log_validate(df, epoch, counter, losses, val_losses, self.validate_count, scores)
Example No. 2
    def test(self):
        LoggerHelper.info("Test Started...")
        self.timer.start()
        df = pandas.DataFrame(columns=['Accuracy', 'Test Accuracy', 'Mean Test Loss'])
        val_losses = []
        self.model.eval()
        counter = 0
        accuracy = 0
        result = np.asarray([])
        result_expected = np.asarray([])
        for x, y in self.reader.get_data(NewsDnnBaseDataReader.DictDataTerm["Test"],
                                         NewsDnnBaseDataReader.DictDataType[
                                             self.config["options"]["network_type"]]):
            counter += 1
            x, y = torch.from_numpy(x), torch.from_numpy(y)
            inputs, targets = x, y
            if self.model.can_use_gpu and self.config["networkConfig"]["useGPU"]:
                inputs, targets = inputs.cuda(), targets.cuda()

            output = self.model(inputs)
            val_loss = self.criterion(output, targets.long())
            val_losses.append(val_loss.item())
            accuracy += self.calculate_accuracy(output, targets)
            result = np.append(result, self.get_output(output))
            result_expected = np.append(result_expected, targets.cpu().numpy())  # move to CPU before converting to NumPy
        scores = self.calculate_scores(result_expected, result)
        df = self.log_test(df, accuracy, self.test_count, val_losses, scores)
        Export.append_df_to_excel(df, self.current_date)
        self.timer.stop(time_for="Test")
Example No. 3
 def log_validate_without_loss(self, df, epoch, counter, validate_count,
                               scores):
     LoggerHelper.info(
         "Epoch: {}/{}...".format(epoch + 1, self.epochs) +
         "Step: {}...".format(counter) +
         "Accuracy In Step: {:.4f}...".format(scores["accuracy"]) +
         "F1 : {:.4f}...".format(scores["f1"]) +
         "Precision : {:.4f}...".format(scores["precision"]) +
         "Recall : {:.4f}...".format(scores["recall"]) +
         "Hamming : {:.4f}...".format(scores["hamming"]) +
         "Jaccard : {:.4f}...".format(scores["jaccard"]) +
         "Val Count: {:.4f}...".format(validate_count))
     df = df.append(
         {
             'Epoch': "{}/{}".format(epoch + 1, self.epochs),
             'Step': counter,
             'Accuracy In Step': scores["accuracy"],
             'F1 In Step': scores["f1"],
             'Precision In Step': scores["precision"],
             'Recall In Step': scores["recall"],
             'Hamming In Step': scores["hamming"],
             'Jaccard In Step': scores["jaccard"]
         },
         ignore_index=True)
     return df
Example No. 4
 def load_model(self, path):
     self.model = BertForSequenceClassification.from_pretrained(path)  # re-load
     self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # re-load
     LoggerHelper.info("**Model Info**"
                       + "\nbatch_size : " + str(self.reader.batch_size)
                       + "\nsequence_length : " + str(self.reader.sequence_length))
     LoggerHelper.info("Model loaded from disk")
Example No. 5
 def log_validate(self, df, epoch, counter, loss, val_losses,
                  validate_count, scores):
     LoggerHelper.info(
         "Epoch: {}/{}...".format(epoch + 1, self.epochs) +
         "Step: {}...".format(counter) +
         "Loss: {:.4f}...".format(loss.item()) +
         "Accuracy In Step: {:.4f}...".format(scores["accuracy"]) +
         "F1 : {:.4f}...".format(scores["f1"]) +
         "Precision : {:.4f}...".format(scores["precision"]) +
         "Recall : {:.4f}...".format(scores["recall"]) +
         "Hamming : {:.4f}...".format(scores["hamming"]) +
         "Jaccard : {:.4f}...".format(scores["jaccard"]) +
         "Val Count: {:.4f}...".format(validate_count) +
         "Val Loss: {:.4f}".format(np.mean(val_losses)))
     df = df.append(
         {
             'Epoch': "{}/{}".format(epoch + 1, self.epochs),
             'Step': counter,
             'Last Train Loss': loss.item(),
             'Mean Test Loss': np.mean(val_losses),
             'Accuracy In Step': scores["accuracy"],
             'F1 In Step': scores["f1"],
             'Precision In Step': scores["precision"],
             'Recall In Step': scores["recall"],
             'Hamming In Step': scores["hamming"],
             'Jaccard In Step': scores["jaccard"]
         },
         ignore_index=True)
     return df
Example No. 6
 def evaluate(self):
     LoggerHelper.info("Evaluation Started...")
     nlp = pipeline('sentiment-analysis')
     self.load_model(self.config["evaluation"]["load"])
     self.model.eval()
     self.timer.start()
     db = Mongo()
     news_collection = db.create_collection(self.config["evaluation"]["collection"])
     news_filtered = db.create_collection(self.config["evaluation"]["destination"], NewsOrganizer.get_index_models())
     count = 0
     processed = 0
     while True:
         try:
             cursor = news_collection.find(self.config["evaluation"]["query"], no_cursor_timeout=True).skip(
                 processed)
             for news in cursor:
                 try:
                     summery = news.get('summery')
                     b_input_ids, b_input_mask = self.reader.get_one_news(summery)
                     b_input_ids, b_input_mask = b_input_ids.to(self.device), b_input_mask.to(self.device)
                     outputs = self.model(b_input_ids, token_type_ids=None,
                                          attention_mask=b_input_mask)
                     logits = outputs[0].detach().cpu().numpy()  # Move result to CPU
                     result = np.argmax(logits, axis=1).flatten()  # predicted class index per input
                     sentiment = nlp(summery)
                     if result[0] == 1:
                         news_filtered.insert({
                             "_id": news.get('_id'),
                             "title": news.get('title'),
                             "summery": news.get('summery'),
                             "article": news.get('article'),
                             "url": news.get('url'),
                             "category": news.get('category'),
                             "price_after_minute": news.get('price_after_minute'),
                             "price_after_hour": news.get('price_after_hour'),
                             "price_after_day": news.get('price_after_day'),
                             "sentiment": sentiment,
                             "price_before": news.get('price_before'),
                             "wiki_relatedness": news.get('wiki_relatedness'),
                             "tweet_count": news.get('tweet_count'),
                             "tweet_percentage": news.get('tweet_percentage'),
                             "date": news.get('date'),
                             "authors": news.get('authors'),
                             "comment": news.get('comment'),
                             "price_effect": news.get('price_effect')
                         })
                 except Exception as exception:
                     Logger().get_logger().error(type(exception).__name__, exc_info=True)
                     traceback.print_exc()
                 count = count + 1
                 if count % 500 == 0:
                     print(count)
                 processed += 1
             cursor.close()
             break
         except CursorNotFound:
             processed += 1
             print("Lost cursor. Retry with skip")
     self.timer.stop(time_for="Evaluation")
Example No. 7
def load_config():
    LoggerHelper.info("Loading Config...")
    pwd = os.path.dirname(os.path.abspath(__file__))
    if platform.system() == "Windows":
        Config.add_config_ini('%s\\initialization\\main_w.ini' % pwd)
    else:
        Config.add_config_ini('%s/initialization/main.ini' % pwd)
    LoggerHelper.info("Loading is loaded.")
Example No. 8
 def get_network_input_size(self):
     size = self.config["wordEmbedding"]["size"]
     if self.config["options"]["wiki"]["enabled"]:
         size = size + self.config["options"]["wiki"]["multiply_factors"]
     if self.config["options"]["twitter"]["enabled"]:
         size = size + self.config["options"]["twitter"]["multiply_factors"]
     LoggerHelper.info("Network Input Size :" + str(size))
     return size
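For illustration, here is a minimal sketch of the config lookups used above with hypothetical values (the key structure mirrors the code; the concrete numbers are assumptions):

# Hypothetical config values illustrating get_network_input_size (numbers are assumed)
config = {
    "wordEmbedding": {"size": 100},
    "options": {
        "wiki": {"enabled": True, "multiply_factors": 1},
        "twitter": {"enabled": True, "multiply_factors": 1},
    },
}
size = config["wordEmbedding"]["size"]                         # 100
if config["options"]["wiki"]["enabled"]:
    size += config["options"]["wiki"]["multiply_factors"]      # 101
if config["options"]["twitter"]["enabled"]:
    size += config["options"]["twitter"]["multiply_factors"]   # 102
print(size)  # 102, which matches the default input_size in Example No. 15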
Example No. 9
 def calculate_hidden_size(self):
     samples_in_training_data = 116100
     scaling_factor = 5
     input_neurons = self.input_size
     output_neurons = self.output_size
     size = int(samples_in_training_data / (scaling_factor * (input_neurons + output_neurons)))
     LoggerHelper.info('Calculated hidden size is ' + str(size))
     if size == 0:
         LoggerHelper.error('Calculated hidden size was 0; using 2 instead')
         return 2
     else:
         return size
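As a quick check of the heuristic above, using the layer sizes from Example No. 15 (102 inputs, 3 outputs) as assumed values:

# hidden = samples / (scaling_factor * (inputs + outputs)); the layer sizes below
# are assumptions taken from Example No. 15 for illustration
samples_in_training_data = 116100
scaling_factor = 5
input_neurons, output_neurons = 102, 3
hidden = int(samples_in_training_data / (scaling_factor * (input_neurons + output_neurons)))
print(hidden)  # 116100 / (5 * 105) = 221 (truncated toward zero)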
Example No. 10
    def save_model(self):
        # Serialize a checkpoint dict with torch.save (not JSON)
        save_file_name = self.get_save_file_name()
        checkpoint = {
            'model': NewsDnnGeneralModel(),
            'model_state_dict': self.model.state_dict(),
            'optimizer': optim.Adam(self.model.parameters(), lr=self.model.lr),
            'optimizer_state_dict': self.optimizer.state_dict()
        }

        torch.save(checkpoint, save_file_name)
        LoggerHelper.info("Model Saved to disk")
Example No. 11
 def load_model(self, path):
     checkpoint = torch.load(path)
     self.model = checkpoint['model']
     self.model.load_state_dict(checkpoint['model_state_dict'])
     self.optimizer = checkpoint['optimizer']
     self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
     LoggerHelper.info("**Model Info**"
                       + "\nbatch_size : " + str(self.reader.batch_size)
                       + "\nsequence_length : " + str(self.reader.sequence_length)
                       + "\ninput_size : " + str(self.model.input_size)
                       + "\nnum_layers : " + str(self.model.num_layers)
                       + "\ndrop_prob : " + str(self.model.drop_prob)
                       + "\nlr : " + str(self.model.lr))
     LoggerHelper.info("Model loaded from disk")
Example No. 12
    def test(self):
        LoggerHelper.info("Test Started...")
        self.timer.start()
        df = pandas.DataFrame(columns=['Accuracy', 'Test Accuracy', 'Mean Test Loss'])
        # Tracking variables
        val_losses = []
        predictions, true_labels = [], []

        test_set = self.reader.get_data(NewsCateDataReader.DictDataTerm["Test"],
                                        NewsCateDataReader.DictDataType[
                                            self.config["options"]["network_type"]])
        self.model.eval()
        accuracy = 0
        for batch in test_set:
            # Add batch to GPU
            batch = tuple(t.to(self.device) for t in batch)

            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels = batch

            with torch.no_grad():
                # Forward pass, calculate logit predictions
                outputs = self.model(b_input_ids, token_type_ids=None,
                                     attention_mask=b_input_mask)

                logits = outputs[0]

                # Move logits and labels to CPU
                logits = logits.detach().cpu().numpy()
                label_ids = b_labels.to('cpu').numpy()

                # Calculate the accuracy for this batch of test sentences.
                label, acc = self.calculate_accuracy(logits, label_ids)
                accuracy += acc

                # Store predictions and true labels
                predictions.append(label)
                true_labels.append(label_ids)
        scores = self.calculate_scores(predictions, true_labels)
        df = self.log_test(df, accuracy, self.test_count, val_losses, scores)
        Export.append_df_to_excel(df, self.current_date)
        self.timer.stop(time_for="Test")
Example No. 13
 def validate(self, df, epoch, losses):
     LoggerHelper.info("Validation Started...")
     # Get validation loss
     val_losses = []
     predictions, true_labels = [], []
     self.model.eval()
     accuracy = 0
     steps = 0
     validate_set = self.reader.get_data(NewsCateDataReader.DictDataTerm["Validate"],
                                         NewsCateDataReader.DictDataType[
                                             self.config["options"]["network_type"]])
     for batch in validate_set:  # Evaluate data for one epoch
         # Add batch to GPU
         batch = tuple(t.to(self.device) for t in batch)
         # Unpack the inputs from our dataloader
         b_input_ids, b_input_mask, b_labels = batch
         with torch.no_grad():  # Not to compute or store gradients
             outputs = self.model(b_input_ids,
                                  token_type_ids=None,
                                  attention_mask=b_input_mask)
             logits = outputs[0]
             # Move logits and labels to CPU
             logits = logits.detach().cpu().numpy()
             label_ids = b_labels.to('cpu').numpy()
             # Calculate the accuracy for this batch of test sentences.
             label, tmp_eval_accuracy = self.calculate_accuracy(logits, label_ids)
             # Accumulate the total accuracy.
             accuracy += tmp_eval_accuracy
             # Track the number of batches
             steps += 1
             # Store predictions and true labels
             predictions.append(label)
             true_labels.append(label_ids)
     # Report the final accuracy for this validation run.
     LoggerHelper.info("Accuracy: {0:.2f}".format(accuracy / steps))
     scores = self.calculate_scores(predictions, true_labels)
     self.model.train()  # reset to train mode after iterating through validation data
     return self.log_validate_without_loss(df, epoch, 0, self.validate_count, scores)
Example No. 14
    def __init__(self,
                 input_size=1,
                 hidden=None,
                 n_layers=2,
                 drop_prob=0.2,
                 lr=0.001,
                 training_data_size=100000,
                 output_size=3,
                 use_gpu=True):
        super().__init__()
        self.should_use_gpu = use_gpu
        self.input_size = input_size
        self.training_data_size = training_data_size
        self.num_layers = n_layers
        self.drop_prob = drop_prob
        self.lr = lr
        self.output_size = output_size
        if hidden is None:
            self.hidden = self.calculate_hidden_size()
        else:
            self.hidden = hidden

        self.lstm = nn.LSTM(
            self.input_size,  # Expected features in the input
            self.hidden,  # Features in the hidden state
            self.num_layers,  # Number of stacked LSTM layers
            bias=True,  # Whether bias weights are used
            dropout=drop_prob,  # Dropout on the output of each LSTM layer except the last
            batch_first=True,  # Input and output tensors are provided as (batch, seq, feature)
            bidirectional=False)  # Unidirectional LSTM

        # Additional Dropout Layer
        self.dropout = nn.Dropout(drop_prob)

        # Fully-Connected Output Layer
        self.fc = nn.Linear(self.hidden, output_size)

        # LogSoftmax output layer
        self.sig = nn.LogSoftmax(dim=1)

        # Check GPU Usage
        self.can_use_gpu = torch.cuda.is_available()
        if self.can_use_gpu:
            if self.should_use_gpu:
                LoggerHelper.info('Training on GPU!')
            else:
                LoggerHelper.info('GPU usage is disabled by config.json')
        else:
            LoggerHelper.info(
                'No GPU available, training on CPU; consider making n_epochs very small.'
            )
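The forward pass is not part of this example; a minimal sketch consistent with the layers defined above (classifying from the last time step, which is an assumption) could look like this:

    def forward(self, x, hidden=None):
        # x is (batch, seq, feature) because the LSTM was built with batch_first=True
        lstm_out, hidden = self.lstm(x, hidden)
        out = self.dropout(lstm_out[:, -1, :])  # keep only the last time step
        out = self.fc(out)
        return self.sig(out)  # log-probabilities; LogSoftmax output pairs with nn.NLLLoss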
Example No. 15
    def __init__(self,
                 input_size=102,
                 n_layers=2,
                 drop_prob=0.2,
                 n_filters=100,
                 filter_sizes=[3, 4, 5],
                 lr=0.001,
                 output_size=3,
                 use_gpu=True):
        super().__init__()
        self.should_use_gpu = use_gpu
        self.input_size = input_size
        self.output_size = output_size
        self.num_layers = n_layers
        self.drop_prob = drop_prob
        self.lr = lr

        # 1D Convolution Layers (one per filter size)
        self.conv_0 = nn.Conv1d(in_channels=input_size,
                                out_channels=n_filters,
                                kernel_size=filter_sizes[0])

        self.conv_1 = nn.Conv1d(in_channels=input_size,
                                out_channels=n_filters,
                                kernel_size=filter_sizes[1])

        self.conv_2 = nn.Conv1d(in_channels=input_size,
                                out_channels=n_filters,
                                kernel_size=filter_sizes[2])

        # Additional Dropout Layer
        self.dropout = nn.Dropout(drop_prob)
        # Fully-Connected Output Layer
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_size)
        # Softmax output layer
        self.sig = nn.Softmax(dim=1)
        # Check GPU Usage
        self.can_use_gpu = torch.cuda.is_available()
        if self.can_use_gpu:
            if self.should_use_gpu:
                LoggerHelper.info('Training on GPU!')
            else:
                LoggerHelper.info('GPU usage is disabled by config.json')
        else:
            LoggerHelper.info('No GPU available, training on CPU; consider making n_epochs very small.')
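The matching forward pass is also omitted here; a TextCNN-style sketch that fits the three Conv1d branches and the len(filter_sizes) * n_filters linear layer (the channels-first input layout and the use of torch.nn.functional as F are assumptions) might be:

    def forward(self, x):
        # assumes: import torch.nn.functional as F at module level
        # x is (batch, input_size, seq_len); Conv1d expects channels first
        c0 = F.relu(self.conv_0(x))
        c1 = F.relu(self.conv_1(x))
        c2 = F.relu(self.conv_2(x))
        # Global max-pool over the time dimension of each branch
        p0 = F.max_pool1d(c0, c0.shape[2]).squeeze(2)
        p1 = F.max_pool1d(c1, c1.shape[2]).squeeze(2)
        p2 = F.max_pool1d(c2, c2.shape[2]).squeeze(2)
        cat = self.dropout(torch.cat((p0, p1, p2), dim=1))  # (batch, 3 * n_filters)
        return self.sig(self.fc(cat))  # class probabilities from the Softmax layer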
Example No. 16
 def stop(self, time_for=None):
     end_dt = dt.datetime.now()
     if time_for is None:
         LoggerHelper.info('Time taken: %s' % (end_dt - self.start_dt))
     else:
         LoggerHelper.info('Time taken for ' + time_for + ' : %s' % (end_dt - self.start_dt))
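stop relies on self.start_dt having been recorded earlier; a minimal sketch of the assumed start counterpart (names follow the usage above):

 def start(self):
     # Assumed counterpart to stop(): records the reference time subtracted above
     self.start_dt = dt.datetime.now()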
Example No. 17
 def get_network_input_size(self):
     size = 1
     LoggerHelper.info("Network Input Size :" + str(size))
     return size
Example No. 18
def main():
    # Load Config
    load_config()
    # Load arg
    args = load_arg()

    if args.fdc:
        LoggerHelper.info("Starting Financial Data Collector Mode...")
        fdc = FDC()
        fdc.collect()
        LoggerHelper.info("Financial Data Collector Mode is ended.")

    if args.wiki:
        LoggerHelper.info("Starting Wikipedia Load Mode...")
        wiki = WikiRecorder()
        wiki.collect_all()
        LoggerHelper.info("Wikipedia Load Mode is ended.")

    if args.organize:
        LoggerHelper.info("Starting News Organizer Mode...")
        collector = NewsOrganizer()
        collector.dnn_organizer_for_dnn_filtered_news()
        LoggerHelper.info("News Organizer Mode is ended.")

    if args.ind:
        LoggerHelper.info("Starting Indicators Collector Mode...")
        ind_collector = IndicatorsCollector()
        if args.ind == "zip":
            ind_collector.collect_from_zip()
        else:
            ind_collector.collect()
        LoggerHelper.info("Indicators Collector Mode is ended.")

    if args.news is not None:
        LoggerHelper.info("Starting Stock Prediction Mode...")
        news_dnn = get_news_type(args.news)
        news_dnn.train(print_every=int(Config.training.print_every))
        news_dnn.test()
        LoggerHelper.info("News Stock Prediction is ended.")
        # WordEmbedding(path=Config.word_embedding.path)
        # news_dnn = NewsDnnMain(epochs=int(Config.training.epochs),
                                # batch_size=int(Config.training.batch_size),
                                # seq_length=int(Config.training.sequence_length),
                                # lr=float(Config.training.lr))
    if args.statistics:
        LoggerHelper.info("Starting Statistic Collection Mode...")
        Statistics().collect()
        LoggerHelper.info("Statistic Collection is ended...")

    if args.test:
        LoggerHelper.info("Starting Test Mode...")
        TransformersTest.sentiment_analysis_test()
        LoggerHelper.info("Test Mode is ended...")

    if args.webservice:
        web_manager = WebManager()
        web_manager.add_static_files()
        web_manager.add_news_root()
        web_manager.run()
Example No. 19
    def train(self, print_every=20):
        df = pandas.DataFrame(columns=['Epoch', 'Step',
                                       'Train Mean Loss Cumulative', 'Train Accuracy',
                                       'Val Mean Loss', 'Val Accuracy'])
        self.timer.start()
        self.model.train()  # Set mode of model
        losses = []
        train_set = self.reader.get_data(fetch_type=NewsCateDataReader.DictDataTerm["Train"],
                                         data_type=NewsCateDataReader.DictDataType[
                                             self.config["options"]["network_type"]])
        for e in range(self.epochs):
            print(self.config["options"]["network_type"])
            print(NewsCateDataReader.DictDataType[
                      self.config["options"]["network_type"]])
            self.model.train()  # Set to Train Mode
            total_loss_for_epoch = 0

            epoch_timer = Timer()
            epoch_timer.start()
            for step, batch in enumerate(train_set): # For each batch of training data...
                # Progress update every print_every batches.
                if step % print_every == 0:
                    # Report progress.
                    print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_set)))
                # Get Data
                b_input_ids = batch[0].to(self.device)
                b_input_mask = batch[1].to(self.device)
                b_labels = batch[2].to(self.device)

                # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
                self.model.zero_grad()

                # Perform a forward pass (evaluate the model on this training batch).
                # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
                outputs = self.model(b_input_ids,
                                     token_type_ids=None,
                                     attention_mask=b_input_mask,
                                     labels=b_labels)
                loss = outputs[0]
                total_loss_for_epoch += loss.item()

                # Perform a backward pass to calculate the gradients.
                loss.backward()

                # This is to help prevent the "exploding gradients" problem.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

                # Update the parameters based on their gradients, the learning rate, etc.
                self.optimizer.step()

                # Update the learning rate.
                self.scheduler.step()

            # Calculate the average loss over the training data.
            avg_train_loss = total_loss_for_epoch / len(train_set)

            # Store the loss value for plotting the learning curve.
            losses.append(avg_train_loss)
            LoggerHelper.info("  Average training loss: {0:.2f}".format(avg_train_loss))
            epoch_timer.stop(time_for="Epoch")

            timer = Timer(start=True)
            df = self.validate(df, e, losses)
            timer.stop(time_for="Validate")
            self.model.train()
        self.timer.stop(time_for="Train")
        self.save_model()
        self.current_date = DateHelper.get_current_date()
        Export.append_df_to_excel(df, self.current_date)
        Export.append_df_to_excel(self.get_info(), self.current_date)
Example No. 20
 def save_model(self):
     save_file_name = self.get_save_file_name()
     FileHelper.create_path_if_not_exists(save_file_name)
     self.model.save_pretrained(save_file_name)  # save
     self.reader.tokenizer.save_pretrained(save_file_name)  # save
     LoggerHelper.info("Model Saved to disk")