def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold, code_freq, training_opt):

    # Start Training
    # Start Training
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags, code_freq)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags, code_freq)

    model_filename = models_folder + "/" + "%i_%s__%s" % (fold, "most_freq_code", str(randint(0, 9999999)))

    model = CRFTagger(feature_func=comp_feat_extactor, verbose=False, training_opt=training_opt)
    model.train(td_sents, model_filename)

    td_predictions = model.tag_sents(to_sentences(td_sents))
    vd_predictions = model.tag_sents(to_sentences(vd_sents))

    # for evaluation - binary tags
    # YS (ACTUAL)
    td_sents_pset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)
    os.remove(model_filename)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    td_sents_by_code = to_tagged_sentences_by_code(essays_TD, regular_tags)
    vd_sents_by_code = to_tagged_sentences_by_code(essays_VD, regular_tags)

    wd_td_ys_bytag = dict()
    wd_vd_ys_bytag = dict()
    td_wd_predictions_by_code = dict()
    vd_wd_predictions_by_code = dict()

    for code in sorted(regular_tags):
        print("Fold %i Training code: %s" % (fold, code))
        td, vd = td_sents_by_code[code], vd_sents_by_code[code]

        model_filename = models_folder + "/" + "%i_%s__%s" % (fold, code, str(randint(0, 9999999)))

        # documentation: http://www.chokkan.org/software/crfsuite/manual.html
        training_opt = {"feature.possible_states": False,
                        "feature.possible_transitions": False,
                        "c2": 2.0
                        }
        model = CRFTagger(feature_func=comp_feat_extactor, verbose=False, training_opt=training_opt)
        model.train(td, model_filename)

        wd_td_ys_bytag[code] = to_flattened_binary_tags(td)
        wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd)

        td_predictions = model.tag_sents(to_sentences(td))
        vd_predictions = model.tag_sents(to_sentences(vd))
        # Delete model file now predictions obtained
        # Note, we are randomizing name above, so we need to clean up here
        os.remove(model_filename)

        td_wd_predictions_by_code[code] = to_flattened_binary_tags(td_predictions)
        vd_wd_predictions_by_code[code] = to_flattened_binary_tags(vd_predictions)
    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
Ejemplo n.º 3
0
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    td_sents_by_code = to_tagged_sentences_by_code(essays_TD, regular_tags)
    vd_sents_by_code = to_tagged_sentences_by_code(essays_VD, regular_tags)

    wd_td_ys_bytag = dict()
    wd_vd_ys_bytag = dict()
    td_wd_predictions_by_code = dict()
    vd_wd_predictions_by_code = dict()

    for code in sorted(regular_tags):
        print("Fold %i Training code: %s" % (fold, code))
        td, vd = td_sents_by_code[code], vd_sents_by_code[code]

        model_filename = models_folder + "/" + "%i_%s__%s" % (fold, code, str(randint(0, 9999999)))

        # documentation: http://www.chokkan.org/software/crfsuite/manual.html
        model = CRFTagger(feature_func=comp_feat_extactor, verbose=False)
        model.train(td, model_filename)

        wd_td_ys_bytag[code] = to_flattened_binary_tags(td)
        wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd)

        td_predictions = model.tag_sents(to_sentences(td))
        vd_predictions = model.tag_sents(to_sentences(vd))
        # Delete model file now predictions obtained
        # Note, we are randomizing name above, so we need to clean up here
        os.remove(model_filename)

        td_wd_predictions_by_code[code] = to_flattened_binary_tags(td_predictions)
        vd_wd_predictions_by_code[code] = to_flattened_binary_tags(vd_predictions)
    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
Ejemplo n.º 4
0
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold,
                            training_opt):

    # Start Training
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    model_filename = models_folder + "/" + "%i_%s__%s" % (
        fold, "power_set", str(randint(0, 9999999)))

    model = CRFTagger(feature_func=comp_feat_extactor,
                      verbose=False,
                      training_opt=training_opt)
    model.train(td_sents, model_filename)

    td_predictions = model.tag_sents(to_sentences(td_sents))
    vd_predictions = model.tag_sents(to_sentences(vd_sents))

    # for evaluation - binary tags
    # YS (ACTUAL)
    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(
        td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(
        vd_predictions, regular_tags)

    os.remove(model_filename)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):

    # Start Training
    print("Fold %i Training code" % fold)

    # Important - only compute code frequency from training data (NO CHEATING)
    code_freq = tally_code_frequencies(essays_TD)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags,
                                                    code_freq)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags,
                                                    code_freq)

    model_filename = models_folder + "/" + "%i_%s__%s" % (
        fold, "most_freq_code", str(randint(0, 9999999)))

    model = CRFTagger(feature_func=comp_feat_extactor, verbose=False)
    model.train(td_sents, model_filename)

    td_predictions = model.tag_sents(to_sentences(td_sents))
    vd_predictions = model.tag_sents(to_sentences(vd_sents))

    # for evaluation - binary tags
    # YS (ACTUAL)
    td_sents_pset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset,
                                                      regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset,
                                                      regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(
        td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(
        vd_predictions, regular_tags)
    os.remove(model_filename)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
Ejemplo n.º 6
0
class CRF:
    def __init__(self, config):
        self.word_ftrs = config['word_ftrs']
        for ftr in self.word_ftrs:
            if ftr not in CRF.WORD_FTRS:
                raise Exception(
                    'Unknown feature {}. See CRF.WORD_FTRS for supported ones.'
                    .format(CRF.WORD_FTRS))
        self.stc_ftrs = config['stc_ftrs']
        for ftr in self.stc_ftrs:
            if ftr not in CRF.STC_FTRS:
                raise Exception(
                    'Unknown feature {}. See CRF.STC_FTRS for supported ones.'.
                    format(CRF.STC_FTRS))
        self.words_ids = config['extr_word_idx']

    def prep_data(self, file='data/HaaretzOrnan_annotated.txt'):
        self.data = []
        with codecs.open(file, encoding='utf-8') as f:
            lines = f.readlines()
            self.data.append([])
            for line in lines:
                line = line.rstrip()
                # Start new sentence
                if line.startswith(u'#'):
                    continue
                if len(line) == 0:
                    if len(self.data[-1]) > 0:
                        self.data.append([])
                    continue
                # Append word to last sentence
                w = line.split(u' ')[3]
                w = w.replace(u'-', u'')
                self.data[-1].append(w)

        # If sentence is empty - remove it
        if len(self.data[-1]) == 0:
            self.data.remove(self.data[-1])
        return self

    def shuffle(self, seed=None):
        # Shuffle based on seed
        inds = np.arange(len(self.data))
        np.random.seed(seed)
        np.random.shuffle(inds)
        self.data = [self.data[i] for i in inds]
        return self

    def split(self, valid_ratio=0.1):
        # Split to train and validation based on ratio.
        # If ratio is 0 use all data for training
        num_train = int(len(self.data) * (1 - valid_ratio))
        self.train_set = self.data[:num_train]
        self.valid_set = None if valid_ratio == 0 else self.data[num_train:]
        return self

    def train(self, load_model=None):
        train_set = CRF._fin_data_prep(self.train_set)
        _extract_ftr = self._gen_ftr_func()
        self.model = CRFTagger(_extract_ftr,
                               verbose=False,
                               training_opt={
                                   "num_memories": 500,
                                   "delta": 1e-8
                               })
        self.model.train(train_set, 'stc_crf_model')
        return self

    def eval(self):
        conf_mat = np.zeros((len(CRF.VOWELS), len(CRF.VOWELS)))
        valid_set = CRF._fin_data_prep(self.valid_set)
        valid_stc_cons = [[x[0] for x in w] for w in valid_set]
        valid_stc_vowel = [[x[1] for x in w] for w in valid_set]
        predicted = self.model.tag_sents(valid_stc_cons)
        predicted = [[x[1] for x in w] for w in predicted]
        for w_ind in range(len(predicted)):
            for vow_ind, pred_vow in enumerate(predicted[w_ind]):
                conf_mat[self.VOWELS_IDX[pred_vow],
                         self.VOWELS_IDX[valid_stc_vowel[w_ind][vow_ind]]] += 1
        return conf_mat

    def predict(self, pred_set):
        data = []
        for sent in pred_set:
            sent_cons = u' '.join(sent)
            for i, w in enumerate(sent):
                w_cons = list(w)
                w_pos = [i] * len(w)
                unif_sent = [sent_cons] * len(w)
                d = list(zip(w_cons, w_pos, unif_sent))
                data.append(d)
        pred = self.model.tag_sents(data)
        result = []
        word_idx = 0
        for sent in pred_set:
            result.append([])
            for word in sent:
                pred_smpl = pred[word_idx]
                w = ''.join([entry[0][0] + entry[-1] for entry in pred_smpl])
                result[-1].append(w)
                word_idx += 1
        return result

    @staticmethod
    def _fin_data_prep(data_set):
        data = []
        for sent in data_set:
            sent_cons = u' '.join([x[::2] for x in sent])
            for i, w in enumerate(sent):
                w_cons = list(w[::2])
                w_pos = [i] * len(w[::2])
                unif_sent = [sent_cons] * len(w[::2])
                d = list(zip(w_cons, w_pos, unif_sent))
                data.append(list(zip(d, list(w[1::2]))))
        return data

    @staticmethod
    def _len(x):
        return len(x) if isinstance(x, str) else int(x)

    VOWELS = [u'a', u'e', u'u', u'i', u'o', u'*']
    VOWELS_IDX = {x: i for i, x in enumerate(VOWELS)}
    WORD_FTRS = [
        'IS_FIRST', 'IS_LAST', 'IDX', 'VAL', 'PRV_VAL', 'NXT_VAL', 'FRST_VAL',
        'LST_VAL', 'SCND_VAL', 'SCND_LST_VAL', 'LEN'
    ]
    STC_FTRS = ['IS_FIRST', 'IS_LAST', 'IDX']

    def _gen_ftr_func(self):
        # Closure
        def _extract_ftr(tokens, i):
            def _extract_wrd_ftr(tokens, i, suff):
                feature_list = []

                if i is not None:
                    if 'IS_FIRST' in self.word_ftrs:
                        feature_list.append("is_first{}={}".format(
                            suff, 1 if i == 0 else 0))

                    if 'IS_LAST' in self.word_ftrs:
                        feature_list.append("is_last{}={}".format(
                            suff, 1 if i == (len(tokens) - 1) else 0))

                    if 'IDX' in self.word_ftrs:
                        feature_list.append("pos{}={}".format(suff, i))

                    if 'VAL' in self.word_ftrs:
                        feature_list.append("cur{}={}".format(suff, tokens[i]))

                    if 'PRV_VAL' in self.word_ftrs:
                        if i > 0:
                            feature_list.append("prev{}={}".format(
                                suff, tokens[i - 1]))

                    if 'NXT_VAL' in self.word_ftrs:
                        if i < (len(tokens) - 1):
                            feature_list.append("next{}={}".format(
                                suff, tokens[i + 1]))

                if 'FRST_VAL' in self.word_ftrs:
                    feature_list.append("first{}={}".format(suff, tokens[0]))

                if 'LST_VAL' in self.word_ftrs:
                    feature_list.append("last{}={}".format(suff, tokens[-1]))

                if 'LEN' in self.word_ftrs:
                    feature_list.append("len{}={}".format(suff, len(tokens)))

                if 'SCND_VAL' in self.word_ftrs:
                    if len(tokens) > 1:
                        feature_list.append("scnd{}={}".format(
                            suff, tokens[1]))

                if 'SCND_LST_VAL' in self.word_ftrs:
                    if len(tokens) > 1:
                        feature_list.append("scnd_last{}={}".format(
                            suff, tokens[-2]))

                return feature_list

            feature_list = []
            word_pos = tokens[0][1]
            sent = tokens[0][2].split(' ')

            # Sentence features
            if 'IS_FIRST' in self.stc_ftrs:
                if word_pos == 0:
                    feature_list.append('FIRST_WORD')
            if 'IS_LAST' in self.stc_ftrs:
                if word_pos == (len(sent) - 1):
                    feature_list.append('LAST_WORD')
            if 'IDX' in self.stc_ftrs:
                feature_list.append("idx=" + str(word_pos))

            # word features
            for rel_pos in self.words_ids:
                word_pos = tokens[0][1] + rel_pos
                if word_pos >= 0 and word_pos < len(sent):
                    word = sent[word_pos]
                    feature_list += _extract_wrd_ftr(
                        word, i if rel_pos == 0 else None,
                        '_w{}'.format(rel_pos))

            return feature_list

        return _extract_ftr
Ejemplo n.º 7
0
class CRF:
    def __init__(self, config):
        self.ftrs = config['ftrs']
        for ftr in self.ftrs:
            if ftr not in CRF.WORD_FTRS:
                raise Exception('Unknown feature {}. See CRF.CONFIG for supported ones.'.format(CRF.WORD_FTRS))

    def prep_data(self, file='data/HaaretzOrnan_annotated.txt'):
        self.data = []
        # print('Preparing data')
        with codecs.open(file, encoding='utf-8') as f:
            lines = f.readlines()
            for line in lines:
                line = line.rstrip()
                if line.startswith(u'#') or len(line) == 0:
                    continue
                w = line.split(u' ')[3]
                w = w.replace(u'-', u'')
                self.data.append(list(zip(list(w[::2]), list(w[1::2]))))
        return self

    def shuffle(self, seed=None):
        # Shuffle based on seed
        inds = np.arange(len(self.data))
        np.random.seed(seed)
        np.random.shuffle(inds)
        self.data=[self.data[i] for i in inds]
        return self

    def split(self, valid_ratio=0.1):
        # Split to train and validation based on ratio.
        # If ratio is 0 use all data for training
        num_train = int(len(self.data)*(1-valid_ratio))
        self.train_set = self.data[:num_train]
        self.valid_set = None if valid_ratio==0 else self.data[num_train:]
        return self

    def train(self, load_model=None):
        _extract_ftr = self._gen_ftr_func()
        self.model = CRFTagger(_extract_ftr, verbose=False,
                       training_opt={"num_memories": 500, "delta": 1e-8})
        self.model.train(self.train_set, 'word_crf_model')
        return self

    def eval(self):
        conf_mat = np.zeros((len(CRF.VOWELS), len(CRF.VOWELS)))
        valid_word_cons = [[x[0] for x in w] for w in self.valid_set]
        valid_word_vowel = [[x[1] for x in w] for w in self.valid_set]
        predicted = self.model.tag_sents(valid_word_cons)
        predicted = [[x[1] for x in w] for w in predicted]
        for w_ind in range(len(predicted)):
            for vow_ind, pred_vow in enumerate(predicted[w_ind]):
                conf_mat[self.VOWELS_IDX[pred_vow], self.VOWELS_IDX[valid_word_vowel[w_ind][vow_ind]]] += 1
        return conf_mat

    def predict(self, pred_set):
        result = []
        for sent in pred_set:
            pred_sent = []
            predicted = self.model.tag_sents(sent)
            for i, w_cons in enumerate(predicted):
                pred_sent.append(''.join(x+y for x, y in w_cons))
            result.append(pred_sent)
        return result

    @staticmethod
    def _len(x):
        return len(x) if isinstance(x, str) else int(x)

    VOWELS = [u'a',u'e',u'u',u'i',u'o',u'*']
    VOWELS_IDX = {x:i for i,x in enumerate(VOWELS)}
    WORD_FTRS = ['IS_FIRST', 'IS_LAST', 'IDX', 'VAL', 'PRV_VAL', 'NXT_VAL', 'FRST_VAL', 'LST_VAL', 'SCND_VAL', 'SCND_LST_VAL', 'LEN']

    def _gen_ftr_func(self):
        # Closure
        def _extract_ftr(tokens, i):
            # print(tokens, i, tokens[i])
            feature_list = []
            if 'IS_FIRST' in self.ftrs:
                feature_list.append("is_first="+str(1 if i == 0 else 0))

            if 'IS_LAST' in self.ftrs:
                feature_list.append("is_last="+str(1 if i == (len(tokens)-1) else 0))

            if 'IDX' in self.ftrs:
                feature_list.append("pos="+str(i))

            if 'VAL' in self.ftrs:
                feature_list.append("cur="+tokens[i])

            if 'PRV_VAL' in self.ftrs:
                if i > 0:
                    feature_list.append("prev="+tokens[i-1])

            if 'NXT_VAL' in self.ftrs:
                if i < (len(tokens)-1):
                    feature_list.append("next="+tokens[i+1])

            if 'FRST_VAL' in self.ftrs:
                feature_list.append("first="+tokens[0])

            if 'LST_VAL' in self.ftrs:
                feature_list.append("last="+tokens[-1])

            if 'LEN' in self.ftrs:
                feature_list.append("len="+str(len(tokens)))

            if 'SCND_VAL' in self.ftrs:
                if len(tokens)>1:
                    feature_list.append("scnd="+tokens[1])

            if 'SCND_LST_VAL' in self.ftrs:
                if len(tokens)>1:
                    feature_list.append("scnd_last="+tokens[-2])

            return feature_list
        return _extract_ftr
Ejemplo n.º 8
0
def main(positive, death):
    ############# Compile the dataset ###############
    ## Load the dataset
    text = list()
    response = list()
    file_path = [positive, death]

    for path in file_path:
        input_file = jsonlines.open(path)
        for obj in input_file:
            text.append(obj['text'])
            response.append(obj['annotation']['part1.Response'])

    ## Tweet Preprocessing
    prep_text = list()
    for i in text:
        prep_text.append(p.clean(i))

    ## Tag Keywords and Create Labels
    ### Focus on verbs--therefore, try lemmatization first
    wnl = WordNetLemmatizer()
    n_corpus = len(prep_text)
    token_data = ["test"] * n_corpus

    n = 0
    for sent in prep_text:
        token_data[n] = [
            wnl.lemmatize(i, j[0].lower())
            if j[0].lower() in ['a', 'n', 'v'] else wnl.lemmatize(i)
            for i, j in pos_tag(word_tokenize(sent))
        ]
        n = n + 1

    ### Create labels
    death_list = ["die", "dead", "death", "pass", "away"]

    n = 0
    for sent in token_data:
        for idx, token in enumerate(sent):
            if ((token.lower() in ["test", "positive", "result"])
                    and (response[n] == ["yes"])):
                sent[idx] = [sent[idx], "P-Yes"]
            elif ((token.lower() in ["test", "positive", "result"])
                  and (response[n] == ["no"])):
                sent[idx] = [sent[idx], "P-No"]
            elif ((token.lower() in death_list) and (response[n] == ["yes"])):
                sent[idx] = [sent[idx], "D-Yes"]
            elif ((token.lower() in death_list) and (response[n] == ["no"])):
                sent[idx] = [sent[idx], "D-No"]
            else:
                sent[idx] = [sent[idx], "Irr"]
        n = n + 1

    ## Shuffle and split into train data and dev data
    token_data = shuffle(token_data, random_state=6)
    train_data, dev_data = train_test_split(token_data,
                                            test_size=0.3,
                                            random_state=616)
    print(
        f"The number of sentences in training data: {len(train_data)}; The number of sentences in dev data: {len(dev_data)};"
    )

    ############# Fit A CRF Model And Predict ###############
    condition_to_func = {
        "base": my_features,
        "include_neighbors": neighbor_features
    }
    for cond, func in condition_to_func.items():
        # initialize
        crf = CRFTagger(feature_func=func)
        crf.train(train_data, 'model.tagger')
        # Test
        crf._feature_func(prep_text[0].split(), 7)
        crf.tag_sents([['I', 'get', 'covid'], ['he', 'test', 'positive']])

        # Output
        filename = cond + "_final_output.tsv"
        with open(filename, 'w') as pred_file:
            for sent in dev_data:
                sent_words = [item[0] for item in sent]
                gold_tags = [item[1] for item in sent]

                with_tags = crf.tag(sent_words)
                for i, output in enumerate(with_tags):
                    original_word, tag_prediction = output
                    line_as_str = f"{original_word}\t{gold_tags[i]}\t{tag_prediction}\n"
                    pred_file.write(line_as_str)
                # add an empty line after each sentence
                pred_file.write("\n")

    ############# Evaluation ###############
    ## Extract Data with Meaning Labels
    cond_list = ['base', 'include_neighbors']

    for cond in cond_list:
        filename = cond + "_final_output.tsv"

        with open(filename) as fd:
            rd = csv.reader(fd, delimiter="\t", quotechar='"')
            D_data = []
            P_data = []
            for row in rd:
                if len(row) > 1:
                    if row[1] in ['P-Yes', 'P-No']:
                        P_data.append(row)
                    elif row[1] in ['D-Yes', 'D-No']:
                        D_data.append(row)

        column_name = ['token', 'label', 'prediction']
        P_df = pd.DataFrame(P_data, columns=column_name)
        D_df = pd.DataFrame(D_data, columns=column_name)
        Total_df = P_df.append(D_df)

        # Accuracy
        ## Overall Accuracy
        T_a = accuracy_score(Total_df['label'], Total_df['prediction'])

        ## Accuracy, Precision, and Recall for two events
        accuracy = []
        precision = []
        recall = []
        for df in [P_df, D_df]:
            accuracy.append(accuracy_score(df['label'], df['prediction']))
            precision.append(
                sum(1 for item in range(0,
                                        len(df) - 1)
                    if ('Yes' in df['label'][item]
                        and 'Yes' in df['prediction'][item])) /
                sum(1 for item in range(0,
                                        len(df) - 1)
                    if ('Yes' in df['prediction'][item])))
            recall.append(
                sum(1 for item in range(0,
                                        len(df) - 1)
                    if ('Yes' in df['label'][item]
                        and 'Yes' in df['prediction'][item])) /
                sum(1 for item in range(0,
                                        len(df) - 1)
                    if ('Yes' in df['label'][item])))

        ## F-1
        f1 = []
        for num in [0, 1]:
            f1.append((2 * precision[num] * recall[num]) /
                      (precision[num] + recall[num]))

        # Report performance
        print("condition: " + cond)
        print(f"Overall Accuracy {T_a:0.03}")
        covid_event = ['Test Positive', 'Death Case']

        num = 0
        for event in covid_event:
            print(
                f"Scores for {event} : \taccuracy {accuracy[num]:0.03}\tprecision {precision[num]:0.03}\trecall {recall[num]:0.03}\tF1 {f1[num]:0.03}"
            )
            num = num + 1

    ## Basicline Performance / Confusion Matrix
    print("Confusion Matrix:")
    print(pd.crosstab(Total_df['label'], Total_df['prediction']))
    print("Training data:")
    labels = ["P-Yes", "P-No", "D-Yes", "D-No"]
    for label in labels:
        train_data2 = np.concatenate(train_data).flat
        n_label = sum(1 for item in train_data2 if item == label)
        print(f"Number of {label}: {n_label}")