def accuracy_score(self, folder, data):
     """Score the HMM tagger on `data` and report the results.

     For every sequence the Viterbi decoder is run over its tokens; the
     joined predicted tag string is compared against the gold tag of the
     sequence's first (token, tag) pair.  Writes a token/true/pred table
     to ``folder + '\\res.csv'`` and reports accuracy and confusion
     matrices via the module-level helpers.

     Args:
         folder: output directory for ``res.csv``.
         data: iterable of sequences of (token, gold_tag) pairs.
     """
     correct = 0
     total = 0
     correct_by_part = []
     total_by_part = []
     rows = []
     for sequence in data:
         tokens = [pair[0] for pair in sequence]
         predicted = self.viterbi(tokens)
         tag_acquired = ' '.join(str(self.tags[t]) for t in predicted)
         gold = sequence[0][1]
         if tag_acquired == gold:
             correct += 1
             correct_by_part.append(gold)
         total += 1
         total_by_part.append(gold)
         rows.append([sequence[0][0], gold, tag_acquired])
     true_pred_dataset = pd.DataFrame(rows, columns=['token', 'true', 'pred'])
     true_pred_dataset.to_csv(folder + "\\" + 'res.csv', index=False)
     accuracy(correct_by_part, total_by_part, correct, total)
     build_confusion_matrices(true_pred_dataset)
# Ejemplo n.º 2
# 0
def bg_to_upos(tag):
    """Map a Bulgarian TreeTagger tag to a Universal POS tag.

    Entries are checked in order, so more specific prefixes (e.g. 'Mo')
    take priority over later, shorter ones (e.g. 'Mc').
    """
    mapping = (
        (('A', 'Mo', 'Md', 'My', 'H'), 'ADJ'),
        (('D',), 'ADV'),
        (('I',), 'INTJ'),
        (('Nc',), 'NOUN'),
        (('Np',), 'PROPN'),
        (('Vn', 'Vp'), 'VERB'),
        (('R',), 'ADP'),
        (('Vx', 'Vy', 'Vi'), 'AUX'),
        (('Cc', 'Cr', 'Cp'), 'CCONJ'),
        (('Ps',), 'DET'),
        (('Mc',), 'NUM'),
        (('T',), 'PART'),
        (('Pp', 'Pd', 'Pr', 'Pc', 'Pi', 'Pf', 'Pn'), 'PRON'),
        (('Cs',), 'SCONJ'),
    )
    for prefixes, upos in mapping:
        if tag.startswith(prefixes):
            return upos
    return 'X'


def ru_to_upos(tag):
    """Map a Russian TreeTagger tag to a Universal POS tag.

    Note the exact-equality checks: e.g. only the bare tag 'C' maps to
    CCONJ; any other 'C...' tag falls through to 'X' (original behavior).
    """
    if tag.startswith(('A', 'Mc')):
        return 'ADJ'
    if tag == 'C':
        return 'CCONJ'
    if tag == 'I':
        return 'INTJ'
    if tag.startswith(('Mo', 'Mp')):
        return 'NUM'
    if tag.startswith('Nc'):
        return 'NOUN'
    if tag.startswith('Np'):
        return 'PROPN'
    if tag.startswith('P'):
        return 'PRON'
    if tag == 'Q':
        return 'PART'
    if tag.startswith('R'):
        return 'ADV'
    if tag.startswith('S'):
        return 'ADP'
    if tag.startswith('V'):
        return 'VERB'
    return 'X'


def sk_to_upos(tag):
    """Map a Slovak TreeTagger tag to a Universal POS tag.

    'VB' is tested before the generic 'V' prefix so auxiliaries win.
    """
    if tag.startswith('S'):
        return 'NOUN'
    if tag.startswith('A'):
        return 'ADJ'
    if tag.startswith(('P', 'R')):
        return 'PRON'
    if tag.startswith('N'):
        return 'NUM'
    if tag.startswith('VB'):
        return 'AUX'
    if tag.startswith(('V', 'G', 'Y')):
        return 'VERB'
    if tag.startswith('D'):
        return 'ADV'
    if tag.startswith('E'):
        return 'ADP'
    if tag.startswith('O'):
        return 'CCONJ'
    if tag.startswith('T'):
        return 'PART'
    if tag.startswith('J'):
        return 'INTJ'
    if tag.startswith(('W', ':r')):
        return 'PROPN'
    # Z, Q, %, 0, :q, #, ? and anything unrecognized.
    return 'X'


def tree_tag(data, folder, lang):
    """Tag `data` with TreeTagger, map tags to UPOS, and report accuracy.

    Only the first (token, gold_tag) pair of each sequence is tagged and
    scored.  Writes a tok/true/pred table to ``folder + '\\res.csv'`` and
    reports accuracy and confusion matrices via the module-level helpers.

    BUG FIX: the original built the final CSV with
    ``final_dataset[index] = [...]`` which creates one *column* per row
    (instead of ``.loc[index]``), so res.csv was transposed garbage.  The
    table is now built row-wise in a single pass, and the redundant
    second loop is gone.

    Args:
        data: iterable of sequences of (token, gold_tag) pairs.
        folder: output directory for ``res.csv``.
        lang: TreeTagger language code; 'bg', 'ru' and 'sk' tags are
            mapped to UPOS, other languages keep raw TreeTagger tags.
    """
    try:
        tagger = treetaggerwrapper.TreeTagger(TAGLANG=lang)
        correct = 0
        total = 0
        correct_by_part = []
        total_by_part = []
        true_pred_dataset = pd.DataFrame(columns=['tok', 'true', 'pred'])
        for index, sequence in enumerate(data):
            raw = treetaggerwrapper.make_tags(tagger.tag_text(sequence[0][0]))
            tag_acquired = raw[0].pos
            if lang == 'bg':
                tag_acquired = bg_to_upos(tag_acquired)
            elif lang == 'ru':
                tag_acquired = ru_to_upos(tag_acquired)
            elif lang == 'sk':
                tag_acquired = sk_to_upos(tag_acquired)
            gold = data[index][0][1]
            if tag_acquired == gold:
                correct = correct + 1
                correct_by_part.append(gold)
            total = total + 1
            total_by_part.append(gold)
            true_pred_dataset.loc[index] = [
                data[index][0][0], gold, tag_acquired
            ]
        true_pred_dataset.to_csv(folder + "\\" + 'res.csv', index=False)
        accuracy(correct_by_part, total_by_part, correct, total)
        build_confusion_matrices(true_pred_dataset)
    except treetaggerwrapper.TreeTaggerError as e:
        print(e)
 def hybrid_accuracy_score_with_classification(self, data_test, data_train,
                                               folder, grammage,
                                               register_change):
     """Evaluate a hybrid tagger: HMM (Viterbi) vs. n-gram regex patterns,
     with an ExtraTreesRegressor trained on `data_train` to arbitrate
     between the two predictions on `data_test`.

     Writes the token/true/pred table to ``folder + '\\res.csv'`` and
     reports accuracy and confusion matrices via the module-level
     ``accuracy`` and ``build_confusion_matrices`` helpers.

     Args:
         data_test: sequences of (token, gold_tag) pairs to score; only
             the first pair of each sequence is used for scoring.
         data_train: sequences used to fit the arbitration model.
         folder: directory holding ``<grammage>grams.pkl``; also receives
             ``res.csv``.
         grammage: n-gram size as a string (file-name component only).
         register_change: 0/1 flag; non-zero lower-cases tokens before
             pattern matching.
     """
     # Each dictionary value holds two alternative regex patterns per tag.
     with open(folder + "\\" + grammage + "grams.pkl", 'rb') as f:
         final_dictionary = pickle.load(f)
     tags_gram = []
     tags_hmm = []
     tags_golden = []
     # Pass 1 (training data): collect HMM and n-gram predictions side by
     # side with the gold tags.
     for index, sequence in enumerate(data_train):
         predicted_tags = self.viterbi(list(map(lambda x: x[0], sequence)))
         tag_hmm = ' '.join([str(self.tags[tag]) for tag in predicted_tags])
         tag_changed = False
         if (int(register_change) == 0):
             analyzed_token = sequence[0][0]
         else:
             analyzed_token = sequence[0][0].lower()
         # Last matching pattern wins; falls back to the HMM tag when no
         # pattern matches.
         for name in final_dictionary.keys():
             if (re.search(final_dictionary[name][0], analyzed_token) or
                     re.search(final_dictionary[name][1], analyzed_token)):
                 tag_gram = name
                 tag_changed = True
         if not tag_changed:
             tag_gram = tag_hmm
         tags_gram.append(tag_gram)
         tags_hmm.append(tag_hmm)
         tags_golden.append(data_train[index][0][1])
     # Assign each distinct gold tag a numeric id.  Set iteration order is
     # arbitrary, so the ids (and hence the regression targets) are not
     # stable across runs.
     counter = 0
     quantified_tags = {}
     for tag in list(set(tags_golden)):
         quantified_tags[tag] = counter
         counter = counter + 1
     inverted_tags = {v: k for k, v in quantified_tags.items()}
     # Training matrix: (numeric HMM tag, numeric n-gram tag) -> gold id.
     # NOTE(review): quantified_tags only contains tags seen in the gold
     # training labels; an HMM or n-gram prediction outside that set will
     # raise KeyError here and in the test loop below -- confirm intended.
     dataset = pd.DataFrame(columns=['HMM', 'GRAM', 'RES'])
     for i in range(len(tags_golden)):
         dataset.loc[i] = [
             quantified_tags[tags_hmm[i]], quantified_tags[tags_gram[i]],
             quantified_tags[tags_golden[i]]
         ]
     y = dataset.iloc[:, 2]
     y = y.astype('int')
     X = dataset.iloc[:, :2]
     X = np.array(X.values.tolist())
     # A regressor over categorical ids: predictions are later rounded to
     # the nearest id, so the arbitrary id ordering affects the outcome.
     ETR = ExtraTreesRegressor(n_estimators=200,
                               max_depth=8,
                               random_state=0)
     ETR.fit(X, y)
     correct = 0
     total = 0
     correct_by_part = []
     total_by_part = []
     true_pred_dataset = pd.DataFrame(columns=['tok', 'true', 'pred'])
     # Pass 2 (test data): when an n-gram pattern fires, let the trained
     # model arbitrate between HMM and n-gram tags; otherwise keep HMM.
     for index, sequence in enumerate(data_test):
         predicted_tags = self.viterbi(list(map(lambda x: x[0], sequence)))
         tag_hmm = ' '.join([str(self.tags[tag]) for tag in predicted_tags])
         tag_changed = False
         if (int(register_change) == 0):
             analyzed_token = sequence[0][0]
         else:
             analyzed_token = sequence[0][0].lower()
         for name in final_dictionary.keys():
             if (re.search(final_dictionary[name][0], analyzed_token) or
                     re.search(final_dictionary[name][1], analyzed_token)):
                 tag_gram = name
                 tag_changed = True
         if tag_changed:
             # round() maps the regression output to the closest tag id.
             tag_final = inverted_tags[round(
                 ETR.predict(
                     np.array([[
                         quantified_tags[tag_hmm], quantified_tags[tag_gram]
                     ]]))[0])]
             if (tag_final == data_test[index][0][1]):
                 correct = correct + 1
                 correct_by_part.append(data_test[index][0][1])
             true_pred_dataset.loc[index] = [
                 data_test[index][0][0], data_test[index][0][1], tag_final
             ]
         else:
             if (tag_hmm == data_test[index][0][1]):
                 correct = correct + 1
                 correct_by_part.append(data_test[index][0][1])
             true_pred_dataset.loc[index] = [
                 data_test[index][0][0], data_test[index][0][1], tag_hmm
             ]
         total = total + 1
         total_by_part.append(data_test[index][0][1])
     true_pred_dataset.to_csv(folder + "\\" + 'res.csv', index=False)
     accuracy(correct_by_part, total_by_part, correct, total)
     build_confusion_matrices(true_pred_dataset)
 def hybrid_accuracy_score(self, data, folder, grammage, register_change,
                           start_end_symbols):
     """Evaluate the HMM tagger combined with character n-gram heuristics.

     For every sequence the HMM (Viterbi) prediction may be overridden by
     regex n-gram patterns loaded from ``<folder>\\<grammage>grams.pkl``
     (or both the 3- and 4-gram dictionaries for ``'double_3_and_4'``);
     FRAG/PUNCT/DIGIT tokens are tagged directly.  Writes the
     token/true/pred table to ``res.csv`` in `folder`, reports accuracy
     and confusion matrices, and prints mean positions of the decisive
     n-gram (only populated by the grammage='3', lower-case, no-boundary
     configuration).

     BUG FIX: ``found_with_ngram`` is now reset at the top of every
     iteration.  Previously it was assigned only inside the final `else`
     branch, so a FRAG/PUNCT/DIGIT token raised NameError on the first
     iteration and reused a stale value afterwards.

     Args:
         data: iterable of sequences of (token, gold_tag) pairs; only the
             first pair of each sequence is scored.
         folder: directory with the pickled n-gram dictionaries; also the
             output directory for ``res.csv``.
         grammage: '3', '4' or 'double_3_and_4'.
         register_change: 0/1 flag (lower-case the token before matching).
         start_end_symbols: 0/1 flag (wrap the token in '#' markers).
     """
     register_change = int(register_change)
     start_end_symbols = int(start_end_symbols)
     if grammage == 'double_3_and_4':
         with open(folder + "\\3grams.pkl", 'rb') as f:
             three_gram_dictionary = pickle.load(f)
         with open(folder + "\\4grams.pkl", 'rb') as f:
             four_gram_dictionary = pickle.load(f)
     else:
         with open(folder + "\\" + grammage + "grams.pkl", 'rb') as f:
             final_dictionary = pickle.load(f)

     def _matches(dictionary, tag, token):
         # True when either of the two complementary patterns stored for
         # `tag` occurs in `token`.
         return bool(
             re.search(dictionary[tag][0], token)
             or re.search(dictionary[tag][1], token))

     correct = 0
     total = 0
     correct_by_part = []
     total_by_part = []
     true_pred_dataset = pd.DataFrame(columns=['token', 'true', 'pred'])
     # Positions of the n-gram that decided the tag, measured from the
     # start and (as an offset) from the end of the token, per tag and
     # overall.
     overall_positions = []
     verb_positions = []
     adj_positions = []
     x_positions = []
     overall_positions_end = []
     verb_positions_end = []
     adj_positions_end = []
     x_positions_end = []

     def _record_position(tag, token, positions, positions_end):
         # Locate the decisive n-gram in `token`.  Patterns are looked up
         # literally here (str.find), unlike the regex matching above --
         # assumes the pickled patterns are plain substrings; TODO confirm.
         pattern = final_dictionary[tag][0]
         if token.find(pattern) == -1:
             pattern = final_dictionary[tag][1]
         start = token.find(pattern)
         end = len(token) - start + int(grammage) - 1
         positions.append(start)
         positions_end.append(end)
         overall_positions.append(start)
         overall_positions_end.append(end)

     for index, sequence in enumerate(data):
         found_with_ngram = False
         analyzed_token = sequence[0][0]
         if is_frag(sequence[0][0]):
             tag_acquired = 'FRAG'
         elif is_punct(sequence[0][0]):
             tag_acquired = 'PUNCT'
         elif is_digit(sequence[0][0]):
             tag_acquired = 'DIGIT'
         else:
             predicted_tags = self.viterbi(
                 list(map(lambda x: x[0], sequence)))
             tag_acquired = ' '.join(
                 [str(self.tags[tag]) for tag in predicted_tags])
             if register_change != 0:
                 analyzed_token = sequence[0][0].lower()
                 if start_end_symbols != 0:
                     analyzed_token = "#" + analyzed_token + "#"
             # In every branch below, later matches override earlier
             # ones, so tuple order encodes priority (last is strongest).
             if (grammage == '3' and register_change == 1
                     and start_end_symbols == 1):
                 for candidate in ('AUX', 'X', 'SCONJ', 'PROPN', 'ADJ',
                                   'PRON'):
                     if _matches(final_dictionary, candidate,
                                 analyzed_token):
                         tag_acquired = candidate
             elif (grammage == '3' and register_change == 1
                   and start_end_symbols == 0):
                 # The only configuration that tracks n-gram positions.
                 for candidate in ('VERB', 'ADJ', 'X'):
                     if _matches(final_dictionary, candidate,
                                 analyzed_token):
                         tag_acquired = candidate
                         found_with_ngram = True
             elif (grammage == '3' and register_change == 0
                   and start_end_symbols == 0):
                 for candidate in ('VERB', 'ADJ', 'X'):
                     if _matches(final_dictionary, candidate,
                                 analyzed_token):
                         tag_acquired = candidate
             elif grammage == '4':
                 for candidate in ('VERB', 'ADJ', 'X'):
                     if _matches(final_dictionary, candidate,
                                 analyzed_token):
                         tag_acquired = candidate
             elif grammage == 'double_3_and_4':
                 # VERB uses the 4-gram patterns, ADJ/X the 3-gram ones.
                 if _matches(four_gram_dictionary, 'VERB', analyzed_token):
                     tag_acquired = 'VERB'
                 if _matches(three_gram_dictionary, 'ADJ', analyzed_token):
                     tag_acquired = 'ADJ'
                 if _matches(three_gram_dictionary, 'X', analyzed_token):
                     tag_acquired = 'X'
         if (tag_acquired == data[index][0][1]):
             correct = correct + 1
             correct_by_part.append(data[index][0][1])
             if found_with_ngram:
                 # Only reachable for grammage == '3', so both
                 # final_dictionary and int(grammage) are well defined.
                 if tag_acquired == 'VERB':
                     _record_position('VERB', analyzed_token,
                                      verb_positions, verb_positions_end)
                 elif tag_acquired == 'ADJ':
                     _record_position('ADJ', analyzed_token,
                                      adj_positions, adj_positions_end)
                 elif tag_acquired == 'X':
                     _record_position('X', analyzed_token, x_positions,
                                      x_positions_end)
         true_pred_dataset.loc[index] = [
             data[index][0][0], data[index][0][1], tag_acquired
         ]
         total = total + 1
         total_by_part.append(data[index][0][1])
     true_pred_dataset.to_csv(folder + "\\" + 'res.csv', index=False)
     accuracy(correct_by_part, total_by_part, correct, total)
     build_confusion_matrices(true_pred_dataset)
     # np.mean of an empty list yields nan (with a RuntimeWarning) when a
     # tag never triggered position tracking.
     verb_positions_mean = np.mean(verb_positions)
     adj_positions_mean = np.mean(adj_positions)
     x_positions_mean = np.mean(x_positions)
     overall_positions_mean = np.mean(overall_positions)
     verb_positions_end_mean = np.mean(verb_positions_end)
     adj_positions_end_mean = np.mean(adj_positions_end)
     x_positions_end_mean = np.mean(x_positions_end)
     overall_positions_end_mean = np.mean(overall_positions_end)
     print(
         f'Adjective definitive ngram mean position: {adj_positions_mean}\nVerb definitive ngram mean position: {verb_positions_mean}\nX definitive ngram mean position: {x_positions_mean}\nDefinitive ngram mean position: {overall_positions_mean}'
     )
     print(
         f'Adjective definitive ngram from end mean position: {adj_positions_end_mean}\nVerb definitive ngram from end mean position: {verb_positions_end_mean}\nX definitive ngram from end mean position: {x_positions_end_mean}\nDefinitive ngram from end mean position: {overall_positions_end_mean}'
     )
# Ejemplo n.º 5
# 0
def n_gram_test(data, folder, grammage, register_change, start_end_symbols,
                length):
    """Tag a tab-separated test file with the pickled n-gram patterns.

    Each non-comment, non-blank line of `data` contributes one
    (word, gold_tag) row (columns 2 and 4).  A word is tagged with the
    first dictionary key whose pattern matches; when ``length == 1`` a
    word-length lookup is also tried, and 'VERB' is the fallback
    prediction.  Results go to ``folder + '\\res.csv'`` and are reported
    via the module-level ``accuracy`` and ``build_confusion_matrices``.

    BUG FIXES vs. the original:
    - the input file handle was leaked (``open(...).readlines()``);
    - the length-heuristic and fallback branches recorded the loop
      variable ``key`` (whatever dictionary key happened to be current)
      instead of the actual predicted/gold tag;
    - ``total_by_part`` now collects the gold tag, consistent with every
      other scoring function in this module;
    - an empty pattern dictionary no longer raises NameError on ``key``.

    Args:
        data: path of the tab-separated test file (utf-8).
        folder: directory with ``<grammage>grams.pkl`` (and, when
            length == 1, ``length_<grammage>grams.pkl``).
        grammage: n-gram size as a string (file-name component).
        register_change: 0/1 flag; non-zero lower-cases the words.
        start_end_symbols: 0/1 flag; non-zero wraps words in '#'.
        length: 1 enables the word-length heuristic.
    """
    register_change = int(register_change)
    start_end_symbols = int(start_end_symbols)
    test_dataset = pd.DataFrame(columns=['WORD', 'TAG'])
    counter = 0
    with open(data, encoding='utf8') as source:
        for instance in source:
            if instance[0] != "#" and instance.strip():
                cols = instance.split('\t')
                word = cols[1]
                if register_change != 0:
                    word = word.lower()
                    if start_end_symbols != 0:
                        word = "#" + word + "#"
                test_dataset.loc[counter] = [word, cols[3]]
                counter = counter + 1
    with open(folder + "\\" + grammage + "grams.pkl", 'rb') as f:
        final_dictionary = pickle.load(f)
    if (length == 1):
        with open(folder + "\\length_" + grammage + 'grams.pkl', 'rb') as f:
            by_length_dictionary = pickle.load(f)
    correct = 0
    total = 0
    correct_by_part = []
    total_by_part = []
    true_pred_dataset = pd.DataFrame(columns=['tok', 'true', 'pred'])
    for index, row in test_dataset.iterrows():
        word = row['WORD']
        gold = row['TAG']
        predicted = None
        for key in final_dictionary.keys():
            if (re.search(final_dictionary[key][0], word)
                    or re.search(final_dictionary[key][1], word)):
                predicted = key
                break
            if length == 1:
                # Word-length heuristic, tried after the pattern checks of
                # each key exactly as the original did (so the first key
                # whose patterns do not match lets a length match decide).
                if len(word) == by_length_dictionary['CCONJ']:
                    predicted = 'CCONJ'
                    break
                if len(word) == by_length_dictionary['ADP']:
                    predicted = 'ADP'
                    break
                if len(word) == by_length_dictionary['VERB']:
                    predicted = 'VERB'
                    break
        if predicted is None:
            # Fallback when no pattern or length matched.
            predicted = 'VERB'
        if predicted == gold:
            correct = correct + 1
            correct_by_part.append(gold)
        true_pred_dataset.loc[index] = [word, gold, predicted]
        total = total + 1
        total_by_part.append(gold)
    true_pred_dataset.to_csv(folder + "\\" + 'res.csv', index=False)
    accuracy(correct_by_part, total_by_part, correct, total)
    build_confusion_matrices(true_pred_dataset)