Example #1
    def evaluate_spacy(self, eval_data_container: GoldDataContainer):

        assert self.get_textcat_pipeline().cfg.get('exclusive_classes') is not None, \
            "the textcat component must configure 'exclusive_classes'"

        scorer = self.nlp.evaluate(
            eval_data_container.get_in_spacy_format(),
            verbose=False,
        )

        # Manual scoring alternative, kept for reference: score each predicted
        # doc against a GoldParse built from the gold categories.
        # scorer = spacy.scorer.Scorer(pipeline=self.nlp.pipeline)
        #
        # for ed in eval_data_container.gold_data_item_list:
        #
        #     doc_with_cats = self.nlp(ed.text) # tokenization + predictions
        #
        #     gold = spacy.gold.GoldParse(
        #         self.nlp.make_doc(ed.text), # tokenization only, no predictions
        #         cats=ed.cats # correct categories
        #     )
        #
        #     scorer.score(doc_with_cats, gold, verbose=True)

        self.log_trainer(
            "Spacy's scores: {\n" +
            f"  'textcat_score': {scorer.scores['textcat_score']}\n" +
            "  'textcats_per_cat': {\n" + ''.join([
                f"    '{name}': {value}\n"
                for name, value in scorer.scores['textcats_per_cat'].items()
            ]) + "  }\n" + "}")

        return scorer.scores
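    # --- Hedged usage sketch (not part of the original method) -------------
    # How the Scorer returned above is typically consumed in spaCy v2.
    # "my_textcat_model" is a hypothetical model directory; the
    # (text, {"cats": ...}) tuples mirror what get_in_spacy_format() yields.
    #
    #     import spacy
    #
    #     nlp = spacy.load("my_textcat_model")
    #     eval_data = [("Some text.", {"cats": {"LABEL_A": 1.0, "LABEL_B": 0.0}})]
    #     scorer = nlp.evaluate(eval_data, verbose=False)
    #     print(scorer.scores["textcat_score"])     # overall textcat F-score
    #     for name, prf in scorer.scores["textcats_per_cat"].items():
    #         print(name, prf)                      # per-label P/R/F dict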
Example #2
def run():

    gdc_1 = main.load_gold_data(Config1_1)
    gdc_1 = main.transform_gold_data(Config1_1, gdc_1)
    gdc_1 = main.transform_gold_data(Config1_2, gdc_1)
    gdc = GoldDataContainer(cats_list=gdc_1.cats_list)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_1)

    gdc_2 = main.load_gold_data(Config2)
    gdc_2 = main.transform_gold_data(Config2, gdc_2)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_2)

    gdc_3 = main.load_gold_data(Config3)
    gdc_3 = main.transform_gold_data(Config3, gdc_3)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_3)

    gdc_4 = main.load_gold_data(Config4)
    gdc_4 = main.transform_gold_data(Config4, gdc_4)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_4)

    gdc_5 = main.load_gold_data(Config5)
    gdc_5 = main.transform_gold_data(Config5, gdc_5)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_5)

    gdc_6 = main.load_gold_data(Config6)
    gdc_6 = main.transform_gold_data(Config6, gdc_6)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_6)

    trainer = main.init_trainer(ConfigTrain, cats_list=gdc.cats_list)
    main.run_training(config=ConfigTrain,
                      trainer=trainer,
                      gold_data_container=gdc)

    embed()
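# --- Hedged refactoring sketch (not part of the original run()) ------------
# The six load/transform/merge blocks above share one shape; assuming the
# same Config* classes and modules are in scope, they collapse into a loop.
def run_refactored():

    steps = [
        (Config1_1, [Config1_1, Config1_2]),
        (Config2, [Config2]),
        (Config3, [Config3]),
        (Config4, [Config4]),
        (Config5, [Config5]),
        (Config6, [Config6]),
    ]

    gdc = None
    for load_config, transform_configs in steps:
        gdc_part = main.load_gold_data(load_config)
        for transform_config in transform_configs:
            gdc_part = main.transform_gold_data(transform_config, gdc_part)
        if gdc is None:
            gdc = GoldDataContainer(cats_list=gdc_part.cats_list)
        gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_part)

    trainer = main.init_trainer(ConfigTrain, cats_list=gdc.cats_list)
    main.run_training(config=ConfigTrain,
                      trainer=trainer,
                      gold_data_container=gdc)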
Example #3
def create_cats_overview(
        gold_data_container: GoldDataContainer, root_coding_node: CodingNode,
        article_annotated_list: List[ArticleAnnotated]) -> GoldDataContainer:

    # TODO : remove this once sure it's not needed anymore
    def save_cats_hierarchy_into_dict(current_coding_node: CodingNode):

        cats_dict = {}

        for c in current_coding_node.children:
            cats_dict.update(save_cats_hierarchy_into_dict(c))

        return {current_coding_node.coding_value: cats_dict}

    # TODO : remove this once sure it's not needed anymore
    def save_cats_dict_into_list(cat_dict):

        current_leafs_list = []

        for k, v in cat_dict.items():

            if v != {}:

                current_leafs_list.extend(save_cats_dict_into_list(v))

            else:

                current_leafs_list.append(k)

        return current_leafs_list

    def filter_out_unused_cats(
        root_coding_node: CodingNode,
        article_annotated_list: List[ArticleAnnotated],
    ) -> List[str]:

        len_all_aa_list = len(article_annotated_list)
        all_cats_used_list = []

        for cn in root_coding_node.get_all_subnodes():

            len_aa_set = len(cn.article_annotated_set)

            if len_aa_set != 0 and len_aa_set != len_all_aa_list:

                if cn.coding_value in all_cats_used_list:

                    raise Exception(
                        "Category was already added to this list. Such redundancies could interfere with training "
                        "later, where categories are used as dictionary keys.")

                all_cats_used_list.append(cn.coding_value)

        return all_cats_used_list

    gold_data_container.cats_list = filter_out_unused_cats(
        root_coding_node, article_annotated_list)

    return gold_data_container
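# --- Toy illustration (self-contained, hypothetical stand-in class) --------
# The filtering rule above keeps a category only if it is used by at least
# one but not all annotated articles; all-or-nothing labels carry no signal.
class _ToyNode:
    def __init__(self, value, used_by):
        self.coding_value = value
        self.article_annotated_set = used_by

_nodes = [_ToyNode("A", {1, 2}), _ToyNode("B", set()), _ToyNode("C", {1, 2, 3})]
_n_articles = 3
_kept = [n.coding_value for n in _nodes
         if 0 < len(n.article_annotated_set) < _n_articles]
assert _kept == ["A"]  # "B" is never used, "C" is used by every article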
Example #4
def transform_to_gold_data_articles(
    root_coding_node: CodingNode,
    article_annotated_list: List[ArticleAnnotated],
) -> GoldDataContainer:

    gold_data_container = GoldDataContainer()

    gold_data_container = create_cats_overview(gold_data_container,
                                               root_coding_node,
                                               article_annotated_list)

    def save_gold_data_into_container(
        gold_data_container: GoldDataContainer,
        article_annotated_list: List[ArticleAnnotated],
    ) -> GoldDataContainer:

        gold_data_container.gold_data_item_list = []

        def get_cats_assigned(article_annotated, cats_list):

            relevant_cats = set(
                coding_dict["coding_node"].coding_value
                for coding_dict in article_annotated.coding_list)

            # one-hot encoding: 1 if the article was coded with this category
            return {cat: 1 if cat in relevant_cats else 0 for cat in cats_list}

        for article_annotated in article_annotated_list:

            gold_data_container.gold_data_item_list.append(
                GoldDataItem(
                    article_id=article_annotated.article_id,
                    text=article_annotated.article_file_content_cleaned,
                    cats=get_cats_assigned(article_annotated,
                                           gold_data_container.cats_list)))

        return gold_data_container

    gold_data_container = save_gold_data_into_container(
        gold_data_container=gold_data_container,
        article_annotated_list=article_annotated_list,
    )

    return gold_data_container
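# --- Toy check (self-contained) of the one-hot encoding produced above -----
# Category names here are illustrative only.
_cats_list = ["VR", "SM", "SC"]
_relevant_cats = {"SM"}
_cats = {cat: 1 if cat in _relevant_cats else 0 for cat in _cats_list}
assert _cats == {"VR": 0, "SM": 1, "SC": 0}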
Example #5
def persist_gold_data(
    config: Type[ConfigRoot],
    gold_data_container: GoldDataContainer,
):

    log_manager.info_global(
        "--------------------------------"
        "\nPersisting transformed data into json structured for training\n")

    if config.should_do_dummy_run:

        config.gold_data_json_path = config.gold_data_json_path.replace(
            ".json", "__dummy.json")
        gold_data_container.gold_data_item_list = \
            gold_data_container.gold_data_item_list[:40]

    gold_data_manager.persist_to_json(config.gold_data_json_path,
                                      gold_data_container)
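# --- Hedged sketch (the real gold_data_manager.persist_to_json is not shown)
# A plausible shape for the persistence step, assuming GoldDataItem exposes
# article_id, text and cats:
import json

def persist_to_json_sketch(path, gold_data_container):
    payload = {
        "cats_list": gold_data_container.cats_list,
        "gold_data_item_list": [
            {"article_id": gdi.article_id, "text": gdi.text, "cats": gdi.cats}
            for gdi in gold_data_container.gold_data_item_list
        ],
    }
    with open(path, "w", encoding="utf8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)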
Example #6
    def train(self, train_data: GoldDataContainer,
              eval_data: GoldDataContainer, iteration_limit: int):

        start = datetime.now()
        self.log_trainer("--------------------------------"
                         "\nSTART TRAINING\n")

        self.log_trainer(f"model_path: {self.model_path}")
        self.log_trainer(f"train_data_json_path: {self.train_data_json_path}")
        self.log_trainer(f"should_create_model: {self.should_create_model}")
        self.log_trainer(f"should_load_model: {self.should_load_model}")
        self.log_trainer(f"should_persist_model: {self.should_persist_model}")
        self.log_trainer(f"cats: {self.cats}")
        self.log_trainer(f"spacy.prefer_gpu(): {spacy.prefer_gpu()}")
        self.log_trainer(f"iteration_limit: {iteration_limit}")
        self.log_trainer(
            f"len(train_data): {len(train_data.gold_data_item_list)}")
        self.log_trainer(
            f"len(eval_data): {len(eval_data.gold_data_item_list)}")

        # TODO : add hashing of assigned cats
        # TODO : Write cats_list to log too
        hash_texts_train_data = self.get_hash_of_texts(
            [gdi.text for gdi in train_data.gold_data_item_list])
        self.log_trainer(
            f"hash of texts in train_data: {hash_texts_train_data}")
        hash_texts_eval_data = self.get_hash_of_texts(
            [gdi.text for gdi in eval_data.gold_data_item_list])
        self.log_trainer(f"hash of texts in eval_data: {hash_texts_eval_data}")

        textcat = self.get_textcat_pipeline()
        self.log_trainer(
            f"textcat.cfg.get('exclusive_classes', None): {textcat.cfg.get('exclusive_classes', None)}"
        )

        dropout = 0.2
        self.log_trainer(f"dropout: {dropout}")

        other_pipes = [
            pipe for pipe in self.nlp.pipe_names
            if pipe not in ["textcat", "trf_wordpiecer", "trf_tok2vec"]
        ]
        with self.nlp.disable_pipes(*other_pipes):
            optimizer = self.nlp.begin_training()

            for iteration in range(1, iteration_limit + 1):
                losses = {}

                start_iteration = datetime.now()
                self.log_trainer(f"Start iteration: {iteration}")

                for text, annotations in train_data.get_in_spacy_format():

                    self.nlp.update([text], [annotations],
                                    sgd=optimizer,
                                    drop=dropout,
                                    losses=losses)

                end_iteration = datetime.now()
                self.log_trainer(f"End iteration: {iteration}")
                self.log_trainer(
                    f"duration iteration: {end_iteration - start_iteration}")

                self.log_trainer(f"losses: {losses['textcat']}")

                if len(eval_data.gold_data_item_list) > 0:

                    scores, _ = self.evaluate(eval_data)
                    self.log_trainer(
                        f"overall score: {scores['textcat_score']}")
                    # https://github.com/explosion/spaCy/blob/26a90f011b8c21dfc06940579479aaff8006ff74/spacy/scorer.py#L164

                    for cat in scores['textcats_per_cat']:

                        self.log_trainer(
                            f"scores for '{cat}': {scores['textcats_per_cat'][cat]}"
                        )

                if self.should_persist_model:

                    self.persist_model()

        end = datetime.now()
        self.log_trainer("END TRAINING")
        self.log_trainer(f"DURATION TRAINING: {end - start}")
Example #7
def run():

    eval_data_container = main.load_gold_data(ConfigLoadG8)
    eval_data_container = main.transform_gold_data(ConfigLoadG8, eval_data_container)

    modelVR = main.init_trainer(ConfigLoadVRModel)

    main.log_manager.info_global(
        "--------------------------------\n"
        "Evaluating mo11 over the entire dataset g8: \n"
    )
    scores_spacy, scores_manual = modelVR.evaluate(eval_data_container)

    # only look at those examples that mo9 predicts as either AF=SM or AF=SC
    modelAF = main.init_trainer(ConfigLoadAFModel)

    gdis_to_keep = []

    for gdi in eval_data_container.gold_data_item_list: 
    
        doc = modelAF.nlp(gdi.text)

        for cat in ['AF: Social Companions', 'AF: Soziale Medien']: 
            if doc.cats[cat] > 0.5: 
                gdis_to_keep.append(gdi)
                break 

    eval_data_container2 = GoldDataContainer()
    eval_data_container2.cats_list = eval_data_container.cats_list
    eval_data_container2.gold_data_item_list = gdis_to_keep

    main.log_manager.info_global(
        "--------------------------------\n"
        "Evaluating mo11 over those texts in g8 that mo9 predicts to be AF=SM or AF=SC: \n"
    )
    scores_spacy2, scores_manual2 = modelVR.evaluate(eval_data_container2)

    # only look at those examples that were annotated as AF=SM or AF=SC
    
    # we need to reload the data to undo the transformation that removes AF
    eval_data_container = main.load_gold_data(ConfigLoadG8)

    gdis_to_keep = [] 

    for gdi in eval_data_container.gold_data_item_list: 

        for cat in ['AF: Social Companions', 'AF: Soziale Medien']:
            if gdi.cats[cat] == 1:
                gdis_to_keep.append(gdi)
                break 

    eval_data_container3 = GoldDataContainer()
    eval_data_container3.cats_list = eval_data_container.cats_list
    eval_data_container3.gold_data_item_list = gdis_to_keep

    # now apply the transformation that removes all categories except VR
    eval_data_container3 = main.transform_gold_data(ConfigLoadG8, eval_data_container3) 

    main.log_manager.info_global(
        "--------------------------------\n"
        "Evaluating mo11 over those texts in g8 that were annotated as AF=SM or AF=SC: \n"
    )
    scores_spacy3, scores_manual3 = modelVR.evaluate(eval_data_container3)

    embed()
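# --- Hedged helper sketch (hypothetical, not in the original script) -------
# Both filtering passes above share one shape: keep an item if any of the
# given categories passes a predicate.
def filter_gdis(gold_data_items, cats, predicate):
    return [gdi for gdi in gold_data_items
            if any(predicate(gdi, cat) for cat in cats)]

# Usage mirroring the two loops above (AF_CATS is a hypothetical constant;
# the prediction variant re-runs the pipeline per category, so cache the doc
# if that matters):
# AF_CATS = ['AF: Social Companions', 'AF: Soziale Medien']
# predicted = filter_gdis(items, AF_CATS, lambda gdi, c: modelAF.nlp(gdi.text).cats[c] > 0.5)
# annotated = filter_gdis(items, AF_CATS, lambda gdi, c: gdi.cats[c] == 1)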
Example #8
def run_2():
    class Config1_1(ConfigRoot):
        # g1 combined with tr2 produces gold data that was formerly persisted as 's1_articles__tr2_1__sc_sm_alle_anwendungsfelder.json'
        gold_data_json_path = data_flow_registry.gold_data["g1"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule2

    class Config1_2(ConfigRoot):
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config2(ConfigRoot):
        # formerly s2 in prodigy, now p1 in prodigy data, and persisted as gold data as g4
        gold_data_json_path = data_flow_registry.gold_data["g4"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule9

    class Config3(ConfigRoot):
        # formerly s3 in prodigy, now p2 in prodigy data, and persisted as gold data as g5
        gold_data_json_path = data_flow_registry.gold_data["g5"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config4(ConfigRoot):
        # formerly s4 in prodigy, now p3 in prodigy data, and persisted as gold data as g6
        gold_data_json_path = data_flow_registry.gold_data["g6"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config5(ConfigRoot):
        # formerly s5 in prodigy, now p4 in prodigy data, and persisted as gold data as g7
        gold_data_json_path = data_flow_registry.gold_data["g7"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config6(ConfigRoot):
        # formerly s6 in prodigy, now p5 in prodigy data, and persisted as gold data as g8
        gold_data_json_path = data_flow_registry.gold_data["g8"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    ConfigRoot.gold_data_json_path = "../data/gold_data/s1_articles__tr2_1__sc_sm_alle_anwendungsfelder_X.json"
    gdc_old = main.load_gold_data(ConfigRoot)

    gdc_1 = main.load_gold_data(Config1_1)
    gdc_1 = main.transform_gold_data(Config1_1, gdc_1)
    gdc_1 = main.transform_gold_data(Config1_2, gdc_1)
    gdc = GoldDataContainer(cats_list=gdc_1.cats_list)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_1)

    gdc_2 = main.load_gold_data(Config2)
    gdc_2 = main.transform_gold_data(Config2, gdc_2)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_2)

    gdc_3 = main.load_gold_data(Config3)
    gdc_3 = main.transform_gold_data(Config3, gdc_3)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_3)

    gdc_4 = main.load_gold_data(Config4)
    gdc_4 = main.transform_gold_data(Config4, gdc_4)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_4)

    gdc_5 = main.load_gold_data(Config5)
    gdc_5 = main.transform_gold_data(Config5, gdc_5)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_5)

    gdc_6 = main.load_gold_data(Config6)
    gdc_6 = main.transform_gold_data(Config6, gdc_6)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_6)

    gdc_new = gdc

    pair_differences = []

    for i, gdi_o in enumerate(gdc_old.gold_data_item_list):

        found = False

        for gdi_n in gdc_new.gold_data_item_list:

            if gdi_o.article_id == gdi_n.article_id:

                if gdi_o.cats != gdi_n.cats:
                    texts_equal = gdi_o.text == gdi_n.text  # for debugger inspection: do the texts still match?
                    pair_differences.append({"gdi_o": gdi_o, "gdi_n": gdi_n})
                else:
                    print(i)  # cats identical for this article

                found = True
                break

        if not found:
            print(i)  # article_id not present in the new container

    gdc_d = GoldDataContainer(cats_list=gdc.cats_list)
    for p in pair_differences:
        gdc_d.gold_data_item_list.append(p["gdi_o"])
        gdc_d.gold_data_item_list.append(p["gdi_n"])

    ConfigRoot.gold_data_json_path = "../data/gold_data/differences.json"
    main.persist_gold_data(ConfigRoot, gdc_d)

    embed()
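# --- Hedged sketch (same names as in run_2() above) -------------------------
# The quadratic pairwise search can be replaced by an index over article_id,
# assuming article_ids are unique within each container.
def pair_differences_indexed(gdc_old, gdc_new):
    new_by_id = {gdi.article_id: gdi for gdi in gdc_new.gold_data_item_list}
    return [
        {"gdi_o": gdi_o, "gdi_n": new_by_id[gdi_o.article_id]}
        for gdi_o in gdc_old.gold_data_item_list
        if gdi_o.article_id in new_by_id
        and gdi_o.cats != new_by_id[gdi_o.article_id].cats
    ]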
Example #9
def run():
    class Config1_1(ConfigRoot):
        # g1 combined with tr2 produces gold data that was formerly persisted as 's1_articles__tr2_1__sc_sm_alle_anwendungsfelder.json'
        gold_data_json_path = data_flow_registry.gold_data["g1"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule2

    class Config1_2(ConfigRoot):
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config2(ConfigRoot):
        # formerly s2 in prodigy, now p1 in prodigy data, and persisted as gold data as g4
        gold_data_json_path = data_flow_registry.gold_data["g4"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule9

    class Config3(ConfigRoot):
        # formerly s3 in prodigy, now p2 in prodigy data, and persisted as gold data as g5
        gold_data_json_path = data_flow_registry.gold_data["g5"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config4(ConfigRoot):
        # formerly s4 in prodigy, now p3 in prodigy data, and persisted as gold data as g6
        gold_data_json_path = data_flow_registry.gold_data["g6"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config5(ConfigRoot):
        # formerly s5 in prodigy, now p4 in prodigy data, and persisted as gold data as g7
        gold_data_json_path = data_flow_registry.gold_data["g7"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config6(ConfigRoot):
        # formerly s6 in prodigy, now p5 in prodigy data, and persisted as gold data as g8
        gold_data_json_path = data_flow_registry.gold_data["g8"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    ConfigRoot.gold_data_json_path = "../data/gold_data/s1_articles__tr2_1__sc_sm_alle_anwendungsfelder_X.json"
    gdc_old = main.load_gold_data(ConfigRoot)

    gdc_1 = main.load_gold_data(Config1_1)
    gdc_1 = main.transform_gold_data(Config1_1, gdc_1)
    gdc_1 = main.transform_gold_data(Config1_2, gdc_1)
    for gdi in gdc_1.gold_data_item_list:
        gdi.source = "g1"  # TODO: Damit dass geht muss golddataitem und gold_data_manager angepasst werden

    gdc = GoldDataContainer(cats_list=gdc_1.cats_list)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_1)

    gdc_2 = main.load_gold_data(Config2)
    gdc_2 = main.transform_gold_data(Config2, gdc_2)
    for gdi in gdc_2.gold_data_item_list:
        gdi.source = "g4"
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_2)

    gdc_3 = main.load_gold_data(Config3)
    gdc_3 = main.transform_gold_data(Config3, gdc_3)
    for gdi in gdc_3.gold_data_item_list:
        gdi.source = "g5"
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_3)

    gdc_4 = main.load_gold_data(Config4)
    gdc_4 = main.transform_gold_data(Config4, gdc_4)
    for gdi in gdc_4.gold_data_item_list:
        gdi.source = "g6"
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_4)

    gdc_5 = main.load_gold_data(Config5)
    gdc_5 = main.transform_gold_data(Config5, gdc_5)
    for gdi in gdc_5.gold_data_item_list:
        gdi.source = "g7"
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_5)

    gdc_6 = main.load_gold_data(Config6)
    gdc_6 = main.transform_gold_data(Config6, gdc_6)
    for gdi in gdc_6.gold_data_item_list:
        gdi.source = "g8"
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_6)

    get_redundancies_by_id(gdc)
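# --- Hedged sketch for the TODO above ---------------------------------------
# The real GoldDataItem is not shown here; for gdi.source to survive loading,
# merging and persisting, the class (and gold_data_manager) would need to
# carry the field through. Constructor signature assumed:
class GoldDataItemWithSource:
    def __init__(self, article_id, text, cats, source=None):
        self.article_id = article_id
        self.text = text
        self.cats = cats
        self.source = source  # e.g. "g1", "g4", ... as assigned in run() above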
Example #10
def transform_to_gold_data(prodigy_data, db_config, ske_config) -> GoldDataContainer:

    cats_list = [cats_dict["text"] for cats_dict in prodigy_data[0]["options"]]

    gold_data_item_list = []

    # We open this here in case we need it to convert URLs to doc.ids (p2, p3, p4)
    db_connection, db_cursor = open_db_connection(db_config, None, None)

    for row in prodigy_data:

        if row['answer'] != 'accept':
            continue

        answers = row['accept']
        options = row['options']
        cats_assigned = {}

        for cat in cats_list:

            idx = [opt['id'] for opt in options if opt['text'] == cat]

            if len(idx) == 0 or idx[0] not in answers:
                cats_assigned[cat] = 0
            else:
                cats_assigned[cat] = 1

        article_id = None

        # p1: doc.id = row['label'] as well as row['meta']['article_id']
        # p2, p3, p4: row['meta']['url'] -> transform to doc.id via SKE or DB
        # p5: doc.id = row['meta']['docid']

        if 'article_id' in row['meta']: # p1
            article_id = row['meta']['article_id']
        elif 'docid' in row['meta']: # p5
            article_id = row['meta']['docid']
        elif 'url' in row['meta']: # p2, p3, p4
            # First we check whether there is an ID translation in the DB
            db_cursor.execute(
                sql.SQL("""
                    SELECT {col_docid}
                    FROM {tbl_ids}
                    WHERE {col_url} = %(url)s
                """).format(
                    col_docid = sql.Identifier('docid'),
                    tbl_ids = sql.Identifier('ske_docid_pos'),
                    col_url = sql.Identifier('url_index1')
                ),
                {
                    'url': row['meta']['url']
                }
            )
            result = db_cursor.fetchone()
            if result:
                article_id = result['docid']
            else:
                # If that fails, we prompt the SKE
                pos = ske_manager.get_pos_from_url(row['meta']['url'])
                article_id = ske_manager.get_docid_from_pos(ske_config, pos)
                # TODO: Ideally we would then insert this new ID translation into the DB
        else:
            raise Exception("Couldn't locate the annotation's text ID.")

        # TODO : Maybe add a text clean-up here to remove the abundant whitespace? Though it may not make a difference for spaCy.
        gold_data_item_list.append(
            GoldDataItem(
                article_id=article_id,
                text=row["text"] if 'text' in row else row['html'],
                cats=cats_assigned
            )
        )

    close_db_connection(db_connection, db_cursor)

    log_manager.info_global(f"Keeping {len(gold_data_item_list)} data items. ")

    return GoldDataContainer(cats_list=cats_list, gold_data_item_list=gold_data_item_list)
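# --- Hedged sketch for the TODO above (caching the URL -> docid translation)
# Table and column names are taken from the SELECT above; psycopg2's sql
# module is assumed, as in the rest of the function.
def cache_id_translation(db_cursor, url, docid):
    db_cursor.execute(
        sql.SQL("""
            INSERT INTO {tbl_ids} ({col_docid}, {col_url})
            VALUES (%(docid)s, %(url)s)
        """).format(
            tbl_ids=sql.Identifier('ske_docid_pos'),
            col_docid=sql.Identifier('docid'),
            col_url=sql.Identifier('url_index1'),
        ),
        {'docid': docid, 'url': url},
    )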
Example #11
    def save_gold_data_into_container(
        gold_data_container: GoldDataContainer,
        article_annotated_list: List[ArticleAnnotated],
    ):

        cats_list = gold_data_container.cats_list
        gold_data_container.gold_data_item_list = []

        log_manager.info_global(
            "Starting to transform articles with annotations to sentences with annotations.\n"
            "This will take a while.")

        len_article_annotated_list = len(article_annotated_list)
        for i, article_annotated in enumerate(article_annotated_list, start=1):
            if i % 100 == 0 or i == len_article_annotated_list:
                log_manager.info_global(
                    f"at article number: {i}, out of {len_article_annotated_list}"
                )

            sentence_cats_dict = OrderedDict()
            sentence_article_list = sentence_split_func(
                article_annotated.article_file_content_cleaned)

            for sentence in sentence_article_list:

                sentence_cats_dict[str(sentence)] = {
                    cat: 0
                    for cat in cats_list
                }

            for coding in article_annotated.coding_list:

                segment = coding["Segment"]
                segment = re.sub("<.*?>", "", segment)
                sentence_segment_list = sentence_split_func(segment)

                for sentence_segment in sentence_segment_list:

                    for sentence_article in sentence_cats_dict.keys():

                        if str(sentence_segment) in sentence_article:

                            cat = coding["coding_node"].coding_value
                            cat_used_dict = sentence_cats_dict[
                                sentence_article]
                            if cat in cat_used_dict:
                                cat_used_dict[cat] = 1

                            break

            for sentence, cats in sentence_cats_dict.items():

                gold_data_container.gold_data_item_list.append(
                    GoldDataItem(article_id=article_annotated.article_id,
                                 text=sentence,
                                 cats=cats))

        log_manager.info_global(
            f"Transformed {len(article_annotated_list)} articles into {len(gold_data_container.gold_data_item_list)} sentences."
        )

        return gold_data_container
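    # --- Hedged sketch (not in the original snippet) ------------------------
    # sentence_split_func is used above but not shown; one plausible spaCy v2
    # implementation via the rule-based sentencizer (language code assumed):
    #
    #     import spacy
    #
    #     _nlp_sent = spacy.blank("de")
    #     _nlp_sent.add_pipe(_nlp_sent.create_pipe("sentencizer"))
    #
    #     def sentence_split_func(text):
    #         return list(_nlp_sent(text).sents)  # Spans; str(span) yields text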