def create_cats_overview(
        gold_data_container: GoldDataContainer, root_coding_node: CodingNode,
        article_annotated_list: List[ArticleAnnotated]) -> GoldDataContainer:
    """Store the list of categories that are actually in use on the container.

    Every node returned by ``root_coding_node.get_all_subnodes()`` is
    inspected. A node's ``coding_value`` is kept when the size of its
    ``article_annotated_set`` is neither zero nor equal to
    ``len(article_annotated_list)``. The comparison is by size only;
    presumably a set as large as the article list means "annotated on every
    article", which carries no information for training.

    Args:
        gold_data_container: Container whose ``cats_list`` attribute is
            overwritten with the kept categories.
        root_coding_node: Node whose ``get_all_subnodes()`` yields the
            candidate category nodes.
        article_annotated_list: All annotated articles; only its length is
            used.

    Returns:
        The same ``gold_data_container`` object, mutated in place.

    Raises:
        Exception: If two kept nodes share the same ``coding_value``.
    """
    total_articles = len(article_annotated_list)

    # The list preserves traversal order for the caller; the set mirrors its
    # content so the duplicate check stays O(1) instead of scanning the list.
    cats_used: List[str] = []
    cats_seen = set()

    for node in root_coding_node.get_all_subnodes():
        annotated_count = len(node.article_annotated_set)

        # Skip categories attached to no article or to as many articles as
        # there are in total.
        if annotated_count == 0 or annotated_count == total_articles:
            continue

        category = node.coding_value

        if category in cats_seen:
            raise Exception(
                "Category was already added to this list. Such redundancies could interfer later with training "
                "where categories are used as keys for dictionaries.")

        cats_seen.add(category)
        cats_used.append(category)

    gold_data_container.cats_list = cats_used

    return gold_data_container
Example #2
0
def run():
    """Evaluate the VR model on three views of the g8 gold data.

    The model loaded via ``ConfigLoadVRModel`` is evaluated on:

    1. the whole transformed g8 data set,
    2. only the items for which the model loaded via ``ConfigLoadAFModel``
       scores 'AF: Social Companions' or 'AF: Soziale Medien' above 0.5,
    3. only the items whose gold annotation has one of those two categories
       set to 1.

    Nothing is returned. The score variables are not used again inside this
    function; the function ends in ``embed()`` (presumably IPython's
    interactive shell -- confirm against the file's imports), so the locals
    are presumably meant to be inspected there.
    """

    # Load g8 and apply the configured transformation (per the comments
    # further down, it removes all categories except VR).
    eval_data_container = main.load_gold_data(ConfigLoadG8)
    eval_data_container = main.transform_gold_data(ConfigLoadG8, eval_data_container)

    modelVR = main.init_trainer(ConfigLoadVRModel)

    main.log_manager.info_global(
        "--------------------------------\n"
        "Evaluating mo11 over the entire dataset g8: \n"
    )
    # View 1: every item of the transformed data set.
    scores_spacy, scores_manual = modelVR.evaluate(eval_data_container)

    # only look at those examples that mo9 predicts as either AF=SM or AF=SC
    modelAF = main.init_trainer(ConfigLoadAFModel)

    gdis_to_keep = []

    for gdi in eval_data_container.gold_data_item_list: 
    
        # Run the AF model on the raw text to get its category scores.
        doc = modelAF.nlp(gdi.text)

        for cat in ['AF: Social Companions', 'AF: Soziale Medien']: 
            if doc.cats[cat] > 0.5: 
                gdis_to_keep.append(gdi)
                # One matching category is enough; break so the item is not
                # appended twice.
                break 

    # View 2: same category list, but only the items selected above.
    eval_data_container2 = GoldDataContainer()
    eval_data_container2.cats_list = eval_data_container.cats_list
    eval_data_container2.gold_data_item_list = gdis_to_keep

    main.log_manager.info_global(
        "--------------------------------\n"
        "Evaluating mo11 over those texts in g8 that mo9 predicts to be AF=SM or AF=SC: \n"
    )
    scores_spacy2, scores_manual2 = modelVR.evaluate(eval_data_container2)

    # only look at those examples that were annotated as AF=SM or AF=SC
    
    # we need to reload the data to undo the transformation that removes AF
    eval_data_container = main.load_gold_data(ConfigLoadG8)

    gdis_to_keep = [] 

    for gdi in eval_data_container.gold_data_item_list: 

        # Here the gold annotation itself is checked (value 1), not a model
        # prediction.
        for cat in ['AF: Social Companions', 'AF: Soziale Medien']:
            if gdi.cats[cat] == 1:
                gdis_to_keep.append(gdi)
                break 

    # View 3: built from the untransformed data so the AF annotations are
    # still available for the filter above.
    eval_data_container3 = GoldDataContainer()
    eval_data_container3.cats_list = eval_data_container.cats_list
    eval_data_container3.gold_data_item_list = gdis_to_keep

    # now apply the transformation that removes all categories except VR
    eval_data_container3 = main.transform_gold_data(ConfigLoadG8, eval_data_container3) 

    main.log_manager.info_global(
        "--------------------------------\n"
        "Evaluating mo11 over those texts in g8 that were annotated as AF=SM or AF=SC: \n"
    )
    scores_spacy3, scores_manual3 = modelVR.evaluate(eval_data_container3)

    # Hand control to an interactive session; the locals above are not used
    # again in this function.
    embed()