Ejemplo n.º 1
0
def persist_gold_data(
    config: Type[ConfigRoot],
    gold_data_container: GoldDataContainer,
    dummy_run_item_limit: int = 40,
):
    """Write the gold data container to the JSON path named in ``config``.

    When ``config.should_do_dummy_run`` is truthy, two things are changed
    in place before persisting:

    * ``config.gold_data_json_path`` gets ``.json`` replaced by
      ``__dummy.json``. The rewrite is skipped when the path already
      contains ``__dummy.json``, so calling this function repeatedly with
      the same config does not stack several ``__dummy`` markers.
    * ``gold_data_container.gold_data_item_list`` is cut down to its first
      ``dummy_run_item_limit`` items.

    Args:
        config: Configuration object providing ``should_do_dummy_run`` and
            ``gold_data_json_path``. It is modified in a dummy run.
        gold_data_container: Container handed to
            ``gold_data_manager.persist_to_json``. Its item list is
            truncated in a dummy run.
        dummy_run_item_limit: Maximum number of items kept in a dummy run.
            Defaults to 40, the value that was previously hard-coded.
    """

    log_manager.info_global(
        "--------------------------------"
        "\nPersisting transformed data into json structured for training\n")

    if config.should_do_dummy_run:
        dummy_suffix = "__dummy.json"

        # The path is stored back on ``config``, so a second call would see
        # the already-rewritten value; only rewrite it once.
        if dummy_suffix not in config.gold_data_json_path:
            config.gold_data_json_path = config.gold_data_json_path.replace(
                ".json", dummy_suffix)

        gold_data_container.gold_data_item_list = (
            gold_data_container.gold_data_item_list[:dummy_run_item_limit])

    gold_data_manager.persist_to_json(config.gold_data_json_path,
                                      gold_data_container)
Ejemplo n.º 2
0
    def save_gold_data_into_container(
        gold_data_container: GoldDataContainer,
        article_annotated_list: List[ArticleAnnotated],
    ) -> GoldDataContainer:
        """Fill the container with one gold data item per annotated article.

        The container's ``gold_data_item_list`` is reset to an empty list and
        then receives, for every article in ``article_annotated_list``, a
        ``GoldDataItem`` built from the article's id, its cleaned file
        content and a ``{category: 0 or 1}`` dict covering every entry of
        ``gold_data_container.cats_list``.

        Returns:
            The same ``gold_data_container`` instance that was passed in.
        """

        def build_cat_flags(article, known_cats):
            # Coding values that appear in at least one coding of the article.
            assigned_values = {
                coding["coding_node"].coding_value
                for coding in article.coding_list
            }

            # 1 when the category was assigned to the article, 0 otherwise.
            return {
                cat: 1 if cat in assigned_values else 0
                for cat in known_cats
            }

        gold_data_container.gold_data_item_list = []

        gold_data_container.gold_data_item_list.extend(
            GoldDataItem(
                article_id=article.article_id,
                text=article.article_file_content_cleaned,
                cats=build_cat_flags(article, gold_data_container.cats_list),
            )
            for article in article_annotated_list
        )

        return gold_data_container
Ejemplo n.º 3
0
def run():
    """Evaluate one model on three selections of the same gold data set.

    The data is loaded via ``ConfigLoadG8`` and the evaluated model via
    ``ConfigLoadVRModel``; the log messages call them "g8" and "mo11".
    A second model, loaded via ``ConfigLoadAFModel`` ("mo9" in the log
    messages), is only used to select items.

    The three evaluations are:

    1. All items, after ``main.transform_gold_data``.
    2. Items for which the second model scores at least one of the two
       AF categories above 0.5.
    3. Items whose annotation has value 1 for at least one of the two
       AF categories.

    The function returns nothing; it ends by calling ``embed()``
    (presumably IPython's ``embed``, so the score variables can be
    inspected interactively -- confirm against the file's imports).
    """

    eval_data_container = main.load_gold_data(ConfigLoadG8)
    eval_data_container = main.transform_gold_data(ConfigLoadG8, eval_data_container)

    modelVR = main.init_trainer(ConfigLoadVRModel)

    # Evaluation 1: the complete transformed data set.
    main.log_manager.info_global(
        "--------------------------------\n"
        "Evaluating mo11 over the entire dataset g8: \n"
    )
    scores_spacy, scores_manual = modelVR.evaluate(eval_data_container)

    # only look at those examples that mo9 predicts as either AF=SM or AF=SC
    modelAF = main.init_trainer(ConfigLoadAFModel)

    gdis_to_keep = []

    for gdi in eval_data_container.gold_data_item_list: 
    
        doc = modelAF.nlp(gdi.text)

        # Keep the item as soon as one of the two categories scores above
        # 0.5; the break prevents adding the same item twice.
        for cat in ['AF: Social Companions', 'AF: Soziale Medien']: 
            if doc.cats[cat] > 0.5: 
                gdis_to_keep.append(gdi)
                break 

    # Evaluation 2: a fresh container holding only the items selected above.
    eval_data_container2 = GoldDataContainer()
    eval_data_container2.cats_list = eval_data_container.cats_list
    eval_data_container2.gold_data_item_list = gdis_to_keep

    main.log_manager.info_global(
        "--------------------------------\n"
        "Evaluating mo11 over those texts in g8 that mo9 predicts to be AF=SM or AF=SC: \n"
    )
    scores_spacy2, scores_manual2 = modelVR.evaluate(eval_data_container2)

    # only look at those examples that were annotated as AF=SM or AF=SC
    
    # we need to reload the data to undo the transformation that removes AF
    eval_data_container = main.load_gold_data(ConfigLoadG8)

    gdis_to_keep = [] 

    for gdi in eval_data_container.gold_data_item_list: 

        # Here the annotated value (1) is checked instead of a model score.
        for cat in ['AF: Social Companions', 'AF: Soziale Medien']:
            if gdi.cats[cat] == 1:
                gdis_to_keep.append(gdi)
                break 

    # Evaluation 3: container with the items selected by annotation.
    eval_data_container3 = GoldDataContainer()
    eval_data_container3.cats_list = eval_data_container.cats_list
    eval_data_container3.gold_data_item_list = gdis_to_keep

    # now apply the transformation that removes all categories except VR
    eval_data_container3 = main.transform_gold_data(ConfigLoadG8, eval_data_container3) 

    main.log_manager.info_global(
        "--------------------------------\n"
        "Evaluating mo11 over those texts in g8 that were annotated as AF=SM or AF=SC: \n"
    )
    scores_spacy3, scores_manual3 = modelVR.evaluate(eval_data_container3)

    # NOTE(review): the scores_* variables are not used after this point;
    # presumably they are meant to be inspected from the embedded shell.
    embed()
Ejemplo n.º 4
0
    def save_gold_data_into_container(
        gold_data_container: GoldDataContainer,
        article_annotated_list: List[ArticleAnnotated],
    ):
        """Fill the container with one gold data item per article sentence.

        The container's ``gold_data_item_list`` is reset to an empty list.
        Each article's cleaned content is split with ``sentence_split_func``
        and every distinct sentence text starts with all categories of
        ``gold_data_container.cats_list`` set to 0. For each coding of the
        article, the coded segment (with ``<...>`` spans removed) is split
        into sentences as well; the first article sentence that contains a
        segment sentence as a substring gets the coding's category set to 1,
        provided that category is one of the known categories.

        Progress is logged every 100 articles and at the last article.

        Returns:
            The same ``gold_data_container`` instance that was passed in.
        """

        cats_list = gold_data_container.cats_list
        gold_data_container.gold_data_item_list = []

        log_manager.info_global(
            "Starting to transform articles with annotations to sentences with annotations.\n"
            "This will take a while.")

        len_article_annotated_list = len(article_annotated_list)
        for i, article in enumerate(article_annotated_list, start=1):
            at_hundred_step = i % 100 == 0
            if at_hundred_step or i % len_article_annotated_list == 0:
                log_manager.info_global(
                    f"at article number: {i}, out of {len_article_annotated_list}"
                )

            # Sentence text -> {category: 0}, in article order. A sentence
            # text that occurs more than once keeps a single entry.
            flags_by_sentence = OrderedDict(
                (str(article_sentence), dict.fromkeys(cats_list, 0))
                for article_sentence in sentence_split_func(
                    article.article_file_content_cleaned)
            )

            for coding in article.coding_list:
                # Remove every "<...>" span before splitting the segment.
                plain_segment = re.sub("<.*?>", "", coding["Segment"])

                for segment_sentence in sentence_split_func(plain_segment):
                    needle = str(segment_sentence)

                    # Only the first article sentence containing the segment
                    # sentence is considered.
                    matched_text = next(
                        (text for text in flags_by_sentence if needle in text),
                        None,
                    )
                    if matched_text is None:
                        continue

                    coded_cat = coding["coding_node"].coding_value
                    sentence_flags = flags_by_sentence[matched_text]
                    # Categories outside cats_list are ignored.
                    if coded_cat in sentence_flags:
                        sentence_flags[coded_cat] = 1

            gold_data_container.gold_data_item_list.extend(
                GoldDataItem(article_id=article.article_id,
                             text=sentence_text,
                             cats=sentence_flags)
                for sentence_text, sentence_flags in flags_by_sentence.items()
            )

        log_manager.info_global(
            f"Transformed {len(article_annotated_list)} articles into {len(gold_data_container.gold_data_item_list)} sentences."
        )

        return gold_data_container