Example #1
def get_instance_from_link(q1, q2, vocab, get_rel=True):
    """

    :param q1:  The pointer of xml files
    :param vocab:  vocaburary
    :return: instance: instance; label : int;
    """
    q1_subject = Preprocessing(
        q1.getElementsByTagName('OrgQSubject')[0].firstChild.data)
    try:
        question1 = Preprocessing(
            q1.getElementsByTagName('OrgQBody')[0].firstChild.data)
    except (IndexError, AttributeError):  # question body missing or empty
        question1 = ''
    q2_subject = Preprocessing(
        q2.getElementsByTagName('RelQSubject')[0].firstChild.data)
    q2_relevance = q2.getAttribute('RELQ_RELEVANCE2ORGQ')
    try:
        question2 = Preprocessing(
            q2.getElementsByTagName('RelQBody')[0].firstChild.data)
    except (IndexError, AttributeError):  # question body missing or empty
        question2 = ''
    OrgQ = q1_subject + " " + question1
    RelQ = q2_subject + " " + question2
    x = datareader.text_to_instance(OrgQ, RelQ)
    x.index_fields(vocab)
    if get_rel:
        return x, label_dict[q2_relevance]
    else:
        return x
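A minimal usage sketch (hypothetical file name, assuming a SemEval-2016 Task 3 style XML in which each OrgQuestion element wraps its RelQuestion elements, and that vocab is already built):

from xml.dom import minidom

dom = minidom.parse('semeval_cqa_train.xml')  # hypothetical path
instances, labels = [], []
for org_q in dom.getElementsByTagName('OrgQuestion'):
    for rel_q in org_q.getElementsByTagName('RelQuestion'):
        instance, label = get_instance_from_link(org_q, rel_q, vocab)
        instances.append(instance)
        labels.append(label)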
Example #2
def main():
    download_data()
    df = get_df()
    df['overall'] -= 1
    preprocessing = Preprocessing()

    for index in tqdm(range(len(df))):
        review = df.loc[index, 'reviewText']
        df.loc[index, 'reviewText'] = preprocessing.preprocess_text(review)

    df.to_csv("./data/dataset2.csv")
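The preprocess_text method used above is not shown; a purely illustrative cleaning step (an assumption, not this repository's implementation) could look like:

import re

def preprocess_text(text: str) -> str:
    # Illustrative only: lowercase, drop HTML-ish tags, punctuation and extra whitespace.
    text = str(text).lower()
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()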
Example #3
    def get_block_index(self, block: List[str], doc_id_doc_name_dict: Dict):
        """
        строим обратный индекс для блока документов и сохраняем его на диск,
        обновляем хэш с доп.информацией
        :param block: блок документов
        :param doc_id_doc_name_dict: хэш с доп.информацией
        :return:
        """
        block_index: Dict[str, List] = dict()
        # determine the document's article_id
        if len(doc_id_doc_name_dict.keys()) == 0:
            doc_id: int = 0
        else:
            doc_id: int = max(doc_id_doc_name_dict.keys()) + 1
        for doc in block:
            doc_path: str = f'{c.PATH_TO_CORPUS}{doc}'
            # convert the document into a list of terms
            doc_terms: List[str] = Preprocessing.get_terms(doc_path)
            # update the block index and build the hash of term frequencies
            # in the document
            doc_tf_dict: Dict = self.update_index(block_index, doc_terms,
                                                  doc_id)
            # map the document name to its term-frequency hash
            temp_dict: Dict[str, Dict] = dict()
            temp_dict[doc] = doc_tf_dict
            # update the auxiliary metadata hash
            doc_id_doc_name_dict[doc_id] = temp_dict

            doc_id += 1

        with open(f'{c.TEMP_DIR}index_{doc_id - 1}.pickle', 'wb') as f:
            pickle.dump(block_index, f)
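The update_index method called above is not included in the snippet. A plausible sketch matching its call signature (hypothetical, not the repository's code): it extends each term's postings list with the current doc_id and returns the document's term-frequency hash.

from collections import Counter
from typing import Dict, List

def update_index(block_index: Dict[str, List[int]], doc_terms: List[str],
                 doc_id: int) -> Dict[str, int]:
    # Term frequencies for this document.
    doc_tf_dict: Dict[str, int] = dict(Counter(doc_terms))
    # Append doc_id to each term's postings list; ids stay sorted because doc_id only grows.
    for term in doc_tf_dict:
        block_index.setdefault(term, []).append(doc_id)
    return doc_tf_dict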
Example #4
def preprocessing():
    datas = Preprocessing()
    datas.encode_all()
    datas.standardize()

    train_ds, val_ds = data_loader(datas.sales_df, datas.calendar_df)
    return train_ds, val_ds
Example #5
def predict_single_review(review_text: str, preprocessing: Preprocessing,
                          tokenizer, model, device):
    preprocessed_text = preprocessing.preprocess_text(review_text)
    encoded_review = tokenizer.encode_plus(preprocessed_text,
                                           max_length=TOKEN_MAX_LEN,
                                           add_special_tokens=True,
                                           return_token_type_ids=False,
                                           padding='max_length',
                                           truncation=True,
                                           return_attention_mask=True,
                                           return_tensors='pt',
                                           verbose=False)
    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    output = model(input_ids, attention_mask)
    _, prediction = torch.max(output, dim=1)

    print(f'Review text: {review_text}')
    print(f'Processed review text: {preprocessed_text}')
    print(f'Sentiment  : {class_names[prediction]}')
Example #6
from fileIO import FileIO
from preprocess import Preprocessing
from decisionTree import DecisionTree


if __name__ == '__main__':
    filename = 'house-votes-84.data.txt'
    fileio = FileIO()
    data = fileio.read_csv(filename)

    preprocessing = Preprocessing()
    preprocessing.assume_missing_values(data)
    for percent in range(3, 8):
        training_data, testing_data = preprocessing.split_into_training_and_testing(data, percent/float(10))
        attributes_number = len(training_data[0]) - 1
        decision_tree = DecisionTree()
        root_node = decision_tree.build(training_data)
        # decision_tree.print()
        # print("Classification: ")
        accuracy = 0
        for row in testing_data:
            classified = decision_tree.classify(row, decision_tree.root)
            classified.calc_percentages(len(testing_data))
            if (classified.republicans_percent > 50.0 and row[0] == 'republican') or (
                    classified.democrats_percent > 50.0 and row[0] == 'democrat'):
                accuracy += 1

        accuracy = accuracy / float(len(testing_data))
        print("Accuracy using training data", percent/float(10)*100, "% is: ", accuracy)
Example #7
def main():
    if not os.path.exists(DATASET_PATH):
        download_dataset()
    df = pd.read_csv(DATASET_PATH)

    tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

    df['overall'] -= 1
    df_train, df_test = train_test_split(df,
                                         test_size=0.25,
                                         random_state=RANDOM_SEED,
                                         stratify=df[['overall']])
    train_data_loader = create_data_loader(df_train, tokenizer, TOKEN_MAX_LEN,
                                           BATCH_SIZE)
    test_data_loader = create_data_loader(df_test, tokenizer, TOKEN_MAX_LEN,
                                          BATCH_SIZE)

    model = SentimentClassifier(len(class_names), PRE_TRAINED_MODEL_NAME)
    model = model.to(device)

    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
    total_steps = len(train_data_loader) * EPOCHS

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)

    # class weights for loss function for imbalanced problem
    class_weights = compute_class_weight(classes=[0, 1, 2, 3, 4],
                                         y=df_train['overall'],
                                         class_weight='balanced')
    class_weights = torch.FloatTensor(class_weights).to(device)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights).to(device)

    history = defaultdict(list)
    best_accuracy = 0

    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 10)

        train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn,
                                            optimizer, device, scheduler,
                                            len(df_train))

        print(f'Train loss {train_loss} accuracy {train_acc}')

        val_acc, val_loss = eval_model(model, test_data_loader, loss_fn,
                                       device, len(df_test))

        print(f'Val loss {val_loss} accuracy {val_acc}')
        print()

        history['train_acc'].append(train_acc)
        history['train_loss'].append(train_loss)
        history['val_acc'].append(val_acc)
        history['val_loss'].append(val_loss)

        if val_acc > best_accuracy:
            torch.save(model.state_dict(), 'best_model_state.bin')
            best_accuracy = val_acc

    plot_history(history)

    test_acc, _ = eval_model(model, test_data_loader, loss_fn, device,
                             len(df_test))

    y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
        model, test_data_loader, device)

    os.makedirs("model", exist_ok=True)
    torch.save(model.state_dict(), "model/model.pt")

    show_metrics(y_pred, y_pred_probs, y_test)

    preprocessing = Preprocessing()
    predict_single_review("I like it, perfect", preprocessing, tokenizer,
                          model, device)
Example #8
    #Log setting
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    log_file = './log/' + args.log_file
    file_handler = logging.FileHandler(log_file)
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    #Log Info
    logger.info('Device: {}'.format(args.device))

    pp = Preprocessing()

    assert args.nu >= 0 and args.nu <= 1
    assert args.svdd_mode in ['one-class', 'soft']

    train_data = MNIST(root='./',
                       train=True,
                       download=True,
                       transform=ToTensor())
    test_data = MNIST(root='./',
                      train=False,
                      download=True,
                      transform=ToTensor())

    train_data, train_label = train_data.data, train_data.targets
    train_data = pp.normalize(train_data, fit=True)
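The normalize(..., fit=True) call suggests the scaler learns its statistics on the training split and reuses them for later splits; an illustrative sketch of that pattern (an assumption, not the repository's Preprocessing class):

import torch

class MinMaxNormalizer:
    # Hypothetical helper: fit min/max on the training data, reuse them for later splits.
    def normalize(self, data: torch.Tensor, fit: bool = False) -> torch.Tensor:
        data = data.float()
        if fit:
            self.min_val, self.max_val = data.min(), data.max()
        return (data - self.min_val) / (self.max_val - self.min_val + 1e-8)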
Example #9
if c.BUILD_INDEX:
    idx = GetIndex()
    idx.save_block_index()
    idx.combine_block_index()
# read the index and the auxiliary metadata hash from disk
with open('index/full_index.pickle', 'rb') as f:
    index: Dict[int, List] = pickle.load(f)
with open('index/doc_id_doc_name_dict.pickle', 'rb') as f:
    doc_id_doc_name_dict: Dict = pickle.load(f)

while True:
    print('\n\nInput your boolean query:')
    query_string: str = input('>')
    query_list: List[str] = query_string.split()
    # convert the query into a list of article_id arrays and the operators between them
    arrays_and_operators, term_list = pp.process_query(query_list, index,
                                                       doc_id_doc_name_dict)
    # take the first array
    result: Union[str, List] = arrays_and_operators.pop(0)
    # while there are elements left in the list
    while len(arrays_and_operators) > 0:
        operator: str = arrays_and_operators.pop(0)
        array: List[int] = arrays_and_operators.pop(0)

        if operator == 'AND':
            result: List[int] = bs.intersect(result, array)
        elif operator == 'OR':
            result: List[int] = bs.union(result, array)
    if len(result) == 0:
        print('No such results')
        continue
    # rank the search results
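The helpers bs.intersect and bs.union are not shown above; the standard linear merge over sorted postings lists (a sketch, not necessarily the repository's implementation) looks like this:

from typing import List

def intersect(a: List[int], b: List[int]) -> List[int]:
    # Linear-time intersection of two sorted postings lists.
    i, j, out = 0, 0, []
    while i < len(a) and j < len(b):
        if a[i] == b[j]:
            out.append(a[i])
            i += 1
            j += 1
        elif a[i] < b[j]:
            i += 1
        else:
            j += 1
    return out

def union(a: List[int], b: List[int]) -> List[int]:
    # Union keeps every article_id exactly once, sorted.
    return sorted(set(a) | set(b))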
Example #10
    def load_data(self, level="theme", label_name="", include_test=False):
        """
        Auxiliary function used to load the datasets of themes or of a
        specific sub-theme, preprocess the X datasets, and compute the
        basic parameters used by the other methods of this class.

        Parameters
        -------------
        level : (str)
            options are 'theme' and 'subtheme'
        label_name : (str)
            code of the sub-theme
        include_test : (boolean)
            True/False option to include or not the test dataset
        
        Returns
        -------------
        Nothing
        
        Example
        -------------
        from embeddings import Embeddings
        model = Embeddings()
        model.load_data(level="subtheme", label_name="FWE")
        """
        # load data
        if level == "theme":
            self.root = 'data/interim/question1_models/advance/'
            self.root_q2 = 'data/interim/question2_models/'
            exten = '.xlsx'
        else:
            self.root = 'data/interim/subthemes/' + label_name + '/'
            exten = '_subset.xlsx'

        self.X_train = pd.read_excel(self.root + 'X_train' +
                                     exten)['Comment'].tolist()
        self.X_valid = pd.read_excel(self.root + 'X_valid' +
                                     exten)['Comment'].tolist()
        self.y_train = pd.read_excel(self.root + 'y_train' + exten)
        self.y_valid = pd.read_excel(self.root + 'y_valid' + exten)

        if include_test:
            self.X_test = pd.read_excel(self.root + 'X_test' +
                                        exten)['Comment'].tolist()
            self.y_test = pd.read_excel(self.root + 'y_test' + exten)

        if level == "theme":
            self.data_q2 = pd.read_excel(self.root_q2 + 'comments_q2' +
                                         exten)['Comment'].tolist()
            self.data_2015 = pd.read_excel(self.root + 'data_2015' +
                                           exten)['Comment'].tolist()

        print('Loading: files were successfully loaded')

        # check that the loaded data is not empty
        assert len(self.X_train) > 0, 'no records in X_train'
        assert len(self.X_valid) > 0, 'no records in X_valid'
        assert len(self.y_train) > 0, 'no records in y_train'
        assert len(self.y_valid) > 0, 'no records in y_valid'

        print('Preprocess: this step could take time, please be patient')
        self.X_train = Preprocessing().general(self.X_train)
        self.X_valid = Preprocessing().general(self.X_valid)
        if include_test:
            self.X_test = Preprocessing().general(self.X_test)
        if level == "theme":
            self.data_q2 = Preprocessing().general(self.data_q2)
            self.data_2015 = Preprocessing().general(self.data_2015)

        # Get parameters
        self.max_len = max(len(comment.split()) for comment in self.X_train)
        self.vect = Tokenizer()
        self.vect.fit_on_texts(self.X_train)
        self.vocab_size = len(self.vect.word_index) + 1
        return
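The parameters computed at the end (max_len, vect, vocab_size) are presumably consumed when encoding the comments for a Keras model; a sketch of that typical usage (assumed, since the class's training code is not shown above):

from tensorflow.keras.preprocessing.sequence import pad_sequences

model = Embeddings()
model.load_data(level="subtheme", label_name="FWE")

# Encode and pad the training comments with the fitted Tokenizer.
encoded_train = model.vect.texts_to_sequences(model.X_train)
padded_train = pad_sequences(encoded_train, maxlen=model.max_len, padding='post')
# model.vocab_size would then size an Embedding layer, e.g.
# Embedding(input_dim=model.vocab_size, output_dim=128, input_length=model.max_len)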
Example #11
def main(input_dir, output_dir):
    """
    This function loads files from input_dir, makes theme and subtheme predictions
    based on the saved models and saves an excel file with predictions in the output_dir
    """
    print("\n--- START: predict_new_comment.py ---\n")

    print("**Loading the data**")
    ## Reading new comments data
    try:
        new_comments = pd.read_excel(input_dir + '/new_comments.xlsx')
    except FileNotFoundError:
        print("File new_comments.xlsx not found.\n")
        print("--- END: predict_new_comments.py ---\n")
        return

    ## Load training data
    X_train = pd.read_excel(
        'data/interim/question1_models/advance/X_train.xlsx')

    ## Load y_train and extract column names for themes and subthemes
    y_train = pd.read_excel(
        'data/interim/question1_models/advance/y_train.xlsx')
    theme_names = y_train.rename(columns={'FEW': 'FWE'}).iloc[:, :12].columns
    subthemes = y_train.iloc[:, 12:-1].columns

    print('**Preprocessing: this step could take time, please be patient.**')
    X_train = Preprocessing().general(X_train['Comment'])
    new_comments_ppd = Preprocessing().general(new_comments['Comment'])
    new_comment_ppd_df = pd.DataFrame(new_comments_ppd, columns=['Comment'])

    ## Get parameters
    print('**Computing the required parameters**')
    max_len = max(len(comment.split()) for comment in X_train)
    vect = Tokenizer()
    vect.fit_on_texts(X_train)

    encoded_new_comments = vect.texts_to_sequences(new_comments_ppd)
    padded_new_comments = pad_sequences(encoded_new_comments,
                                        maxlen=max_len,
                                        padding='post')

    ## Loading saved model
    print('**Loading the saved theme model**')
    theme_model = tf.keras.models.load_model('models/Theme_Model/theme_model')
    print("**Making the theme predictions**")
    pred_themes_array = theme_model.predict(padded_new_comments)
    pred_themes_array = (pred_themes_array > 0.4) * 1

    ## Making dataframe of prediction
    pred_themes = pd.DataFrame(pred_themes_array, columns=theme_names)

    print(
        "**Theme predictions are successfully done. Predicting subthemes now.**\n"
    )

    ## Creating a dictionary with theme indices as keys and predicted comment indices as values
    ind_dict = dict()
    for i in range(pred_themes_array.shape[1]):
        ind_dict[i] = np.where(pred_themes_array[:, i] == 1)[0]

    ## Creating 2d zero array of size (#comments x 62)
    zero_arrays = np.zeros((pred_themes_array.shape[0], 62))

    subtheme_pos = dict()

    count_i = 0

    for i in range(len(theme_names)):
        count_a = count_i
        for sublab in subthemes:
            if sublab.startswith(theme_names[i]):
                count_i += 1
        subtheme_pos[i] = range(count_a, count_i)

    ## Creating dictionary for theme names and theme indices
    theme_dict = dict()
    model_dict = dict()
    for i in range(len(theme_names)):
        model_dict[i] = str(theme_names[i]).lower() + '_model'
        theme_dict[i] = str(theme_names[i])

    ## Loop for predicting subthemes
    pred_subthemes = dict()
    pred_thresh = {
        0: 0.4,
        1: 0.4,
        2: 0.3,
        3: 0.4,
        4: 0.5,
        5: 0.3,
        6: 0.4,
        7: 0.4,
        8: 0.4,
        9: 0.3,
        10: 0.3,
        11: 0.4
    }

    for i in list(ind_dict.keys()):

        print("**Predicting subthemes for comments classified as label",
              theme_dict[i], "**")

        # subset comments for predicted label
        # print("comment_subsets\n", new_comments_ppd)
        comments_subset = new_comment_ppd_df.iloc[
            ind_dict[i]]  # iloc may not be needed here

        # load respective train set for predicted label
        input_dir_1 = 'data/interim/subthemes/' + str(theme_dict[i])
        x_train = pd.read_excel(input_dir_1 + '/X_train_subset.xlsx')

        # Preprocessing comments and x_train
        print(
            "**Preprocessing training set for this label. This may take a little time**"
        )
        x_train = Preprocessing().general(x_train['Comment'])
        # comments_subset = Preprocessing().general(comments_subset['Comment'])

        # Getting parameters
        print("**Getting the required parameters now**")
        max_len = max(len(comment.split()) for comment in x_train)
        vect = Tokenizer()
        vect.fit_on_texts(x_train)

        # Padding comments
        encoded_docs_comments = vect.texts_to_sequences(
            comments_subset['Comment'])
        padded_docs_comments = pad_sequences(encoded_docs_comments,
                                             maxlen=max_len,
                                             padding='post')

        # loading model
        print("**Loading saved model for theme", model_dict[i], "**")
        model = tf.keras.models.load_model('models/Subtheme_Models/' +
                                           model_dict[i])

        # Predictions
        print("**Predicting subthemes for comments**")
        try:
            pred = model.predict(padded_docs_comments)
            pred = (pred > pred_thresh[i]) * 1
            pred_subthemes[i] = pred
            for j in range(pred_subthemes[i].shape[0]):
                zero_arrays[ind_dict[i][j],
                            subtheme_pos[i]] += pred_subthemes[i][j]
        except Exception:
            continue  # skip this theme if prediction fails
        print("Predictions for subthemes of ", theme_dict[i], "are completed!")
        print('-----------------------------------')

    print("**Subtheme predictions are successfully done**")
    subtheme_pred = pd.DataFrame(zero_arrays, columns=subthemes)

    final_pred = pd.concat(
        [pd.Series(new_comments['Comment']), pred_themes, subtheme_pred],
        axis=1)
    final_pred.to_excel(output_dir + '/predictions.xlsx')
    print("**Predictions have been saved to", output_dir, "**\n")
    print("--- END: predict_new_comments.py ---\n")

    return
Example #12
import os
import sys

import torch

sys.path.insert(0, "..")

from util import BERT_path, download_bert
from transformers import BertTokenizer

from preprocess import Preprocessing
from predict_review.predict_review import predict_single_review
from SentimentClassifier import SentimentClassifier
from consts import PRE_TRAINED_MODEL_NAME, class_names

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if __name__ == "__main__":
    tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
    preprocessing = Preprocessing()
    model = SentimentClassifier(len(class_names), PRE_TRAINED_MODEL_NAME)
    model.to(device)
    if not os.path.exists(BERT_path):
        download_bert()

    model.load_state_dict(torch.load(BERT_path, map_location=device))
    model.eval()
    print("BERT Sentiment Analyzer.")
    review = input("Please enter your review (or 'q' to exit):\n")
    while review != 'q':
        predict_single_review(review, preprocessing, tokenizer, model, device)
        print("_____________________________________________________")
        review = input("Please enter your review (or 'q' to exit):\n")
Example #13
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import load_model
from preprocess import Preprocessing
from model import Model

# Argument is PATH to dataset
pr = Preprocessing('/scratch/smuthi2s/NLP_data/books/books_large_p1.txt')
# pr = Preprocessing('data.txt')
pr.load_data(num_lines=60000)
pr.encode_data()
pr.generate_sequence()
pr.get_data()
print("Maximum length of sequence : ", pr.get_max_length())
pr.save_config()

params = {
    "activation": "softmax",
    "epochs": 100,
    "verbose": 2,
    "loss": "categorical_crossentropy",
    "optimizer": "adam",
    "metrics": ['accuracy'],
Example #14
def main(input_dir, output_dir):
    """
    This function loads files from input_dir, makes subtheme predictions based on the saved models
    and saves an evaluations on test set in the output_dir
    """
    assert os.path.exists(
        input_dir
    ), "The path entered for input_dir does not exist. Make sure to enter correct path \n"
    assert os.path.exists(
        output_dir
    ), "The path entered for output_dir does not exist. Make sure to enter correct path \n"

    print("----START: predict_subtheme.py----\n")
    print("**Loading data and generating necessary dictionaries**")
    ## Reading the comment prediction (.npy file)
    theme_pred = np.load(input_dir +
                         'output/theme_predictions/theme_question1_test.npy')

    ## Reading in the input comments
    X_test = pd.read_excel(input_dir +
                           'interim/question1_models/advance/X_test.xlsx')
    assert len(X_test) > 0, 'no records in X_test.xlsx'

    ## Reading y_test
    y_test = pd.read_excel(input_dir +
                           'interim/question1_models/advance/y_test.xlsx')
    assert len(y_test) > 0, 'no records in y_test.xlsx'

    y_test_subthemes = y_test.iloc[:, 12:-1]

    ## Creating a dictionary with theme indices as keys and predicted comment indices as values
    ind_dict = dict()
    for i in range(theme_pred.shape[1]):
        ind_dict[i] = np.where(theme_pred[:, i] == 1)[0]

    ## Creating 2d zero array of size (#comments x 62)
    zero_arrays = np.zeros((theme_pred.shape[0], 62))

    ## Creating dictionary for subtheme range of columns
    theme_names = y_test.rename(columns={'FEW': 'FWE'}).iloc[:, :12].columns
    subthemes = y_test.iloc[:, 12:-1].columns
    subtheme_pos = dict()

    count_i = 0

    for i in range(len(theme_names)):
        count_a = count_i
        for sublab in subthemes:
            if sublab.startswith(theme_names[i]):
                count_i += 1
        subtheme_pos[i] = range(count_a, count_i)

    ## Creating dictionary for theme names and theme indices
    theme_dict = dict()
    model_dict = dict()
    for i in range(len(theme_names)):
        model_dict[i] = str(theme_names[i]).lower() + '_model'
        theme_dict[i] = str(theme_names[i])

    pred_thresh = {
        0: 0.4,
        1: 0.4,
        2: 0.3,
        3: 0.4,
        4: 0.5,
        5: 0.3,
        6: 0.4,
        7: 0.4,
        8: 0.4,
        9: 0.3,
        10: 0.3,
        11: 0.4
    }

    ## Loop for predicting subthemes
    pred_subthemes = {}
    for i in list(ind_dict.keys()):

        print("**Predicting subthemes for comments classified as label",
              theme_dict[i], "**")
        print("**Subsetting the comments data**")

        # subset comments for predicted label
        comments_subset = X_test.iloc[ind_dict[i]]

        # load respective train set for predicted label
        input_dir_1 = input_dir + '/interim/subthemes/' + str(theme_dict[i])
        x_train = pd.read_excel(input_dir_1 + '/X_train_subset.xlsx')

        # Preprocessing comments and x_train
        print(
            "**Preprocessing X_test and training set for label. This may take a little time**"
        )
        x_train = Preprocessing().general(x_train['Comment'])
        comments_subset = Preprocessing().general(comments_subset['Comment'])

        # Getting parameters
        print("**Getting the required parameters now!!**")
        max_len = max(len(comment.split()) for comment in x_train)
        vect = Tokenizer()
        vect.fit_on_texts(x_train)

        # Padding comments
        encoded_docs_comments = vect.texts_to_sequences(comments_subset)
        padded_docs_comments = pad_sequences(encoded_docs_comments,
                                             maxlen=max_len,
                                             padding='post')

        # loading model
        print("**Loading saved model for theme", model_dict[i], "**")
        model = tf.keras.models.load_model(input_dir +
                                           '/../models/Subtheme_Models/' +
                                           model_dict[i])

        # Predictions
        print("**Predicting subthemes for comments**")
        pred = model.predict(padded_docs_comments)
        pred = (pred > pred_thresh[i]) * 1

        pred_subthemes[i] = pred
        for j in range(pred_subthemes[i].shape[0]):
            zero_arrays[ind_dict[i][j],
                        subtheme_pos[i]] += pred_subthemes[i][j]
        print("**Predictions for subthemes of ", theme_dict[i],
              "are completed!**")
        print('\n')

    accuracy = []
    precision = []
    recall = []
    subtheme_model = []
    f1 = []
    for i in range(12):
        subtheme_model.append(theme_dict[i])
        accuracy.append(
            accuracy_score(
                np.asarray(y_test_subthemes.iloc[:, subtheme_pos[i]]),
                zero_arrays[:, subtheme_pos[i]]))
        precision.append(
            precision_score(np.asarray(y_test_subthemes.iloc[:,
                                                             subtheme_pos[i]]),
                            zero_arrays[:, subtheme_pos[i]],
                            average='micro'))
        recall.append(
            recall_score(np.asarray(y_test_subthemes.iloc[:, subtheme_pos[i]]),
                         zero_arrays[:, subtheme_pos[i]],
                         average='micro'))
        f1.append(
            f1_score(np.asarray(y_test_subthemes.iloc[:, subtheme_pos[i]]),
                     zero_arrays[:, subtheme_pos[i]],
                     average='micro'))

    results = pd.DataFrame(
        data={
            'Subtheme_model': subtheme_model,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1
        })
    results.to_csv(output_dir + 'subtheme_pred_results.csv')
    print("**Results of test set subtheme predictions are saved in",
          output_dir, "**")
    print("----END: predict_subtheme.py----")