Example #1
    def normalizing(self):
        """
        normalizing method is writen for normalizing tweets
        :return:
            normal_tweets: normal tweets
            emojis: emojis of normal tweets
        """
        # load tweets and emojis
        raw_tweets, raw_emojis = self.read_tweets()
        print("Start normalizing tweets ...")

        start_time = time.time()
        # normalizing tweets
        normal_tweets = [
            self.normalizer.normalizer_text(tweet) for tweet in raw_tweets
        ]
        end_time = time.time()

        # calculate normalizing time
        elapsed_mins, elapsed_secs = process_time(start_time, end_time)
        print(
            f"{elapsed_mins} min and {elapsed_secs} sec for normalizing tweets."
        )
        print("End normalizing tweets")
        return normal_tweets, raw_emojis
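
Both examples time their work with a process_time helper that is imported elsewhere and not shown in this listing. A minimal sketch consistent with how it is called above (an assumption, not the original helper):

def process_time(start_time, end_time):
    """Convert an elapsed interval into whole minutes and remaining seconds."""
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs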
Example #2
    def test_split(self, normal_tweets, normal_emojis):
        """
        test_split method is written for split data into train and test set
        :param normal_tweets: list of all tweets
        :param normal_emojis: list of all emojis
        """
        # shuffle tweets
        tweets_list = list(zip(normal_tweets, normal_emojis))
        random.shuffle(tweets_list)
        normal_tweets, normal_emojis = zip(*tweets_list)

        test_tweet_list = []  # list for test tweets
        test_emoji_list = []  # list for test emojis
        train_tweet_list = []  # list for train tweets
        train_emoji_list = []  # list for train emojis

        # split tweets into test and train sets
        start_time = time.time()

        for tweet, emoji in zip(normal_tweets, normal_emojis):
            # skip empty tweets
            if tweet != "":
                # cap the test set at 2,000 tweets per emoji
                if test_emoji_list.count(emoji) < 2000:
                    test_tweet_list.append(tweet)
                    test_emoji_list.append(emoji)
                else:
                    train_tweet_list.append(tweet)
                    train_emoji_list.append(emoji)

        end_time = time.time()

        # calculate test split time
        elapsed_mins, elapsed_secs = process_time(start_time, end_time)
        print(
            f"{elapsed_mins} min and {elapsed_secs} sec to split the test tweets."
        )

        # save data
        self.save_normal_tweets(train_tweet_list,
                                train_emoji_list,
                                output_path=self.train_output_path)
        self.save_normal_tweets(test_tweet_list,
                                test_emoji_list,
                                output_path=self.test_output_path)
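
A note on the split above: test_emoji_list.count(emoji) rescans the growing test list for every tweet, which makes the loop quadratic. A collections.Counter produces the same capped split with constant-time lookups; the sketch below is an illustrative alternative, not the original code:

from collections import Counter

def capped_test_split(tweets, emojis, cap=2000):
    """Route up to `cap` tweets per emoji into the test set, rest into train."""
    test_counts = Counter()
    test_pairs, train_pairs = [], []
    for tweet, emoji in zip(tweets, emojis):
        if tweet == "":  # skip empty tweets
            continue
        if test_counts[emoji] < cap:
            test_pairs.append((tweet, emoji))
            test_counts[emoji] += 1
        else:
            train_pairs.append((tweet, emoji))
    return test_pairs, train_pairs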

    def run(self,
            adding_noise=False,
            lr_decay=False,
            augmentation=False,
            test_augmentation=False):
        """
        run method is written for running model
        """
        data_set = self.load_data_set()
        model, criterion, optimizer = self.init_model(data_set)

        best_validation_loss = float("inf")
        best_test_f_score = 0.0

        best_val_loss_model = ""
        best_test_f_score_model = ""

        losses_dict = dict()
        acc_dict = dict()
        losses_dict["train_loss"] = []
        losses_dict["validation_loss"] = []
        losses_dict["test_loss"] = []
        acc_dict["train_acc"] = []
        acc_dict["validation_acc"] = []
        acc_dict["test_acc"] = []

        augmentation_class = None
        augmentation_methods = None
        # create the augmentation class and its methods
        if augmentation:
            augmentation_class, augmentation_methods = self.create_augmentation(
                data_set)

        # start training model
        for epoch in range(N_EPOCHS):
            start_time = time.time()

            # add Gaussian noise to the w_s1 and w_s2 fully connected layers
            if adding_noise:
                with torch.no_grad():
                    for name, param in model.named_parameters():
                        if name.startswith("w_s1") or name.startswith("w_s2"):
                            param.add_(torch.randn(param.size()).to(DEVICE))

            # train model on train data
            if augmentation:
                train(model=model,
                      iterator=data_set.iterator_dict["train_iterator"],
                      optimizer=optimizer,
                      criterion=criterion,
                      epoch=epoch,
                      augmentation_class=augmentation_class,
                      augmentation_methods=augmentation_methods,
                      lr_decay=lr_decay)
            else:
                train(model=model,
                      iterator=data_set.iterator_dict["train_iterator"],
                      optimizer=optimizer,
                      criterion=criterion,
                      epoch=epoch,
                      lr_decay=lr_decay)

            # compute model result on train data
            train_log_dict = evaluate(
                model=model,
                iterator=data_set.iterator_dict["train_iterator"],
                criterion=criterion)

            losses_dict["train_loss"].append(train_log_dict["loss"])
            acc_dict["train_acc"].append(train_log_dict["acc"])

            # compute model result on validation data
            valid_log_dict = evaluate(
                model=model,
                iterator=data_set.iterator_dict["valid_iterator"],
                criterion=criterion)

            losses_dict["validation_loss"].append(valid_log_dict["loss"])
            acc_dict["validation_acc"].append(valid_log_dict["acc"])

            # compute model result on test data
            test_log_dict = evaluate(
                model=model,
                iterator=data_set.iterator_dict["test_iterator"],
                criterion=criterion)

            losses_dict["test_loss"].append(test_log_dict["loss"])
            acc_dict["test_acc"].append(test_log_dict["acc"])

            end_time = time.time()

            # calculate epoch time
            epoch_mins, epoch_secs = process_time(start_time, end_time)

            # save the model whenever the validation loss decreases
            if valid_log_dict["loss"] < best_validation_loss:
                best_validation_loss = valid_log_dict["loss"]
                torch.save(
                    model.state_dict(),
                    MODEL_PATH + f"model_epoch{epoch + 1}_loss_"
                    f"{valid_log_dict['loss']}.pt")
                best_val_loss_model = f"model_epoch{epoch + 1}_loss_" \
                    f"{valid_log_dict['loss']}.pt"

            # save the model whenever the test F-score increases
            if test_log_dict["total_fscore"] > best_test_f_score:
                best_test_f_score = test_log_dict["total_fscore"]
                torch.save(
                    model.state_dict(), MODEL_PATH + f"model_epoch{epoch + 1}"
                    f"_fscore_{test_log_dict['total_fscore']}.pt")
                best_test_f_score_model = f"model_epoch{epoch + 1}" \
                    f"_fscore_{test_log_dict['total_fscore']}.pt"

            # show model result
            logging.info(
                f"Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s"
            )
            model_result_log(train_log_dict, valid_log_dict, test_log_dict)

            # save model result in log file
            self.log_file.write(f"Epoch: {epoch + 1:02} | Epoch Time: "
                                f"{epoch_mins}m {epoch_secs}s\n")
            model_result_save(self.log_file, train_log_dict, valid_log_dict,
                              test_log_dict)

        # save final model
        torch.save(model.state_dict(), MODEL_PATH + "final_model.pt")

        # test augmentation
        if test_augmentation:
            if not augmentation:
                augmentation_class, _ = self.create_augmentation(data_set)
            self.eval_test_augmentation(
                best_val_loss_model=best_val_loss_model,
                best_test_f_score_model=best_test_f_score_model,
                data_set=data_set,
                aug_class=augmentation_class)

        # plot accuracy and loss curves
        self.draw_curves(train_acc=acc_dict["train_acc"],
                         validation_acc=acc_dict["validation_acc"],
                         test_acc=acc_dict["test_acc"],
                         train_loss=losses_dict["train_loss"],
                         validation_loss=losses_dict["validation_loss"],
                         test_loss=losses_dict["test_loss"])

    def run(self,
            model_name,
            lr_decay=False,
            augmentation=False,
            test_augmentation=False):
        """
        run method is written for running model
        """
        # select model
        if model_name == "bert":
            model_config = BERT_CONFIG
        elif model_name == "parsbert":
            model_config = PARSBERT_CONFIG
        elif model_name == "albert":
            model_config = ALBERT_CONFIG
        else:
            raise ValueError(f"Unknown model name: {model_name}")

        # open log file
        self.log_file = open(model_config["log_path"], "w")

        # load data_set iterators
        data_set = self.load_data_set(model_config)
        # create model
        model, criterion, optimizer = self.init_model(data_set, model_config)

        best_validation_loss = float("inf")
        best_test_f_score = 0.0

        best_val_loss_model = ""
        best_test_f_score_model = ""

        losses_dict = dict()
        acc_dict = dict()
        losses_dict["train_loss"] = []
        losses_dict["dev_loss"] = []
        losses_dict["test_loss"] = []
        acc_dict["train_acc"] = []
        acc_dict["dev_acc"] = []
        acc_dict["test_acc"] = []

        augmentation_class = None
        augmentation_methods = None

        # create the augmentation class and its methods
        if augmentation:
            augmentation_class, augmentation_methods = self.create_augmentation(
                data_set)

        # start training model
        for epoch in range(N_EPOCHS):
            start_time = time.time()

            # train model on train data
            if augmentation:
                train(model=model,
                      iterator=data_set.iterator_dict["train_iterator"],
                      optimizer=optimizer,
                      criterion=criterion,
                      epoch=epoch,
                      augmentation_class=augmentation_class,
                      augmentation_methods=augmentation_methods,
                      lr_decay=lr_decay)
            else:
                train(model=model,
                      iterator=data_set.iterator_dict["train_iterator"],
                      optimizer=optimizer,
                      criterion=criterion,
                      epoch=epoch,
                      lr_decay=lr_decay)

            # compute model result on train data
            train_log_dict = evaluate(
                model=model,
                iterator=data_set.iterator_dict["train_iterator"],
                criterion=criterion)

            losses_dict["train_loss"].append(train_log_dict["loss"])
            acc_dict["train_acc"].append(train_log_dict["acc"])

            # compute model result on dev data
            valid_log_dict = evaluate(
                model=model,
                iterator=data_set.iterator_dict["valid_iterator"],
                criterion=criterion)

            losses_dict["dev_loss"].append(valid_log_dict["loss"])
            acc_dict["dev_acc"].append(valid_log_dict["acc"])

            # compute model result on test data
            test_log_dict = evaluate(
                model=model,
                iterator=data_set.iterator_dict["test_iterator"],
                criterion=criterion)

            losses_dict["test_loss"].append(test_log_dict["loss"])
            acc_dict["test_acc"].append(test_log_dict["acc"])

            end_time = time.time()

            # calculate epoch time
            epoch_mins, epoch_secs = process_time(start_time, end_time)

            # save the model whenever the validation loss decreases
            if valid_log_dict["loss"] < best_validation_loss:
                best_validation_loss = valid_log_dict["loss"]
                torch.save(
                    model.state_dict(), model_config["save_model_path"] +
                    f"model_epoch{epoch + 1}_loss_"
                    f"{valid_log_dict['loss']}.pt")
                best_val_loss_model = f"model_epoch{epoch + 1}_loss_" \
                    f"{valid_log_dict['loss']}.pt"

            # save the model whenever the test F-score increases
            if test_log_dict["total_fscore"] > best_test_f_score:
                best_test_f_score = test_log_dict["total_fscore"]
                torch.save(
                    model.state_dict(),
                    model_config["save_model_path"] + f"model_epoch{epoch + 1}"
                    f"_fscore_{test_log_dict['total_fscore']}.pt")
                best_test_f_score_model = f"model_epoch{epoch + 1}" \
                    f"_fscore_{test_log_dict['total_fscore']}.pt"

            # show model result
            logging.info(
                f"Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s"
            )
            model_result_log(train_log_dict, valid_log_dict, test_log_dict)

            # save model result in log file
            self.log_file.write(f"Epoch: {epoch + 1:02} | Epoch Time: "
                                f"{epoch_mins}m {epoch_secs}s\n")
            model_result_save(self.log_file, train_log_dict, valid_log_dict,
                              test_log_dict)

        # save final model
        torch.save(model.state_dict(),
                   model_config["save_model_path"] + "final_model.pt")

        # test augmentation
        if test_augmentation:
            if not augmentation:
                augmentation_class, _ = self.create_augmentation(data_set)
            self.eval_test_augmentation(
                best_val_loss_model=best_val_loss_model,
                best_test_f_score_model=best_test_f_score_model,
                data_set=data_set,
                aug_class=augmentation_class,
                model_config=model_config)

        # plot accuracy and loss curves
        self.draw_curves(train_acc=acc_dict["train_acc"],
                         validation_acc=acc_dict["dev_acc"],
                         test_acc=acc_dict["test_acc"],
                         train_loss=losses_dict["train_loss"],
                         validation_loss=losses_dict["dev_loss"],
                         test_loss=losses_dict["test_loss"],
                         model_config=model_config)
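
Hypothetical usage of this runner (the surrounding class name is not shown in the listing, so Runner is an assumption):

# runner = Runner()
# runner.run(model_name="parsbert", lr_decay=True,
#            augmentation=False, test_augmentation=False)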