# Example 1
def run():
    """Load, transform, and merge all gold data sources, then train a model.

    Relies on module-level config classes (Config1_1, Config1_2, Config2..Config6,
    ConfigTrain) and the `main` / `gold_data_manager` helpers. Ends by dropping
    into an interactive shell for inspection.
    """

    def _load_and_transform(load_config, transform_configs):
        # Load one gold data source and apply its transform rule(s) in order.
        container = main.load_gold_data(load_config)
        for cfg in transform_configs:
            container = main.transform_gold_data(cfg, container)
        return container

    # (load config, transform configs) per source; g1 needs two transform passes.
    steps = [
        (Config1_1, (Config1_1, Config1_2)),
        (Config2, (Config2,)),
        (Config3, (Config3,)),
        (Config4, (Config4,)),
        (Config5, (Config5,)),
        (Config6, (Config6,)),
    ]

    gdc = None
    for load_config, transform_configs in steps:
        partial = _load_and_transform(load_config, transform_configs)
        if gdc is None:
            # All sources share the same category list after transformation.
            gdc = GoldDataContainer(cats_list=partial.cats_list)
        gdc = gold_data_manager.merge_assuming_identical_categories(gdc, partial)

    trainer = main.init_trainer(ConfigTrain, cats_list=gdc.cats_list)
    main.run_training(config=ConfigTrain,
                      trainer=trainer,
                      gold_data_container=gdc)

    embed()
def transform_to_gold_data_articles(
    root_coding_node: CodingNode,
    article_annotated_list: List[ArticleAnnotated],
) -> GoldDataContainer:
    """Convert annotated articles into article-level gold data.

    For each article, every category in the container's category list is
    flagged 1 if the article carries a coding with that value, else 0.

    Args:
        root_coding_node: Root of the coding tree used to build the category overview.
        article_annotated_list: Annotated articles to transform.

    Returns:
        A GoldDataContainer with one GoldDataItem per article.
    """

    gold_data_container = GoldDataContainer()

    # Populate gold_data_container.cats_list from the coding tree.
    gold_data_container = create_cats_overview(gold_data_container,
                                               root_coding_node,
                                               article_annotated_list)

    def _cats_assigned(article_annotated, cats_list):
        # Category values actually coded on this article.
        relevant_cats = {
            coding_dict["coding_node"].coding_value
            for coding_dict in article_annotated.coding_list
        }
        # 1/0 flag per known category (int(True) == 1, int(False) == 0).
        return {cat: int(cat in relevant_cats) for cat in cats_list}

    gold_data_container.gold_data_item_list = [
        GoldDataItem(
            article_id=article_annotated.article_id,
            text=article_annotated.article_file_content_cleaned,
            cats=_cats_assigned(article_annotated,
                                gold_data_container.cats_list))
        for article_annotated in article_annotated_list
    ]

    return gold_data_container
# Example 3
def run():
    """Evaluate the VR model (mo11) on all of g8, then on two AF-filtered subsets."""

    eval_data_container = main.load_gold_data(ConfigLoadG8)
    eval_data_container = main.transform_gold_data(ConfigLoadG8, eval_data_container)

    modelVR = main.init_trainer(ConfigLoadVRModel)

    main.log_manager.info_global(
        "--------------------------------\n"
        "Evaluating mo11 over the entire dataset g8: \n"
    )
    scores_spacy, scores_manual = modelVR.evaluate(eval_data_container)

    # The two AF categories of interest for both filters below.
    af_cats = ['AF: Social Companions', 'AF: Soziale Medien']

    def _subset_container(cats_list, items):
        # Build a fresh container holding only the given items.
        container = GoldDataContainer()
        container.cats_list = cats_list
        container.gold_data_item_list = items
        return container

    # only look at those examples that mo9 predicts as either AF=SM or AF=SC
    modelAF = main.init_trainer(ConfigLoadAFModel)

    predicted_af = []
    for gdi in eval_data_container.gold_data_item_list:
        doc = modelAF.nlp(gdi.text)
        if any(doc.cats[cat] > 0.5 for cat in af_cats):
            predicted_af.append(gdi)

    eval_data_container2 = _subset_container(
        eval_data_container.cats_list, predicted_af)

    main.log_manager.info_global(
        "--------------------------------\n"
        "Evaluating mo11 over those texts in g8 that mo9 predicts to be AF=SM or AF=SC: \n"
    )
    scores_spacy2, scores_manual2 = modelVR.evaluate(eval_data_container2)

    # only look at those examples that were annotated as AF=SM or AF=SC

    # we need to reload the data to undo the transformation that removes AF
    eval_data_container = main.load_gold_data(ConfigLoadG8)

    annotated_af = [
        gdi
        for gdi in eval_data_container.gold_data_item_list
        if any(gdi.cats[cat] == 1 for cat in af_cats)
    ]

    eval_data_container3 = _subset_container(
        eval_data_container.cats_list, annotated_af)

    # now apply the transformation that removes all categories except VR
    eval_data_container3 = main.transform_gold_data(ConfigLoadG8, eval_data_container3)

    main.log_manager.info_global(
        "--------------------------------\n"
        "Evaluating mo11 over those texts in g8 that were annotated as AF=SM or AF=SC: \n"
    )
    scores_spacy3, scores_manual3 = modelVR.evaluate(eval_data_container3)

    embed()
# Example 4
def run_2():
    """Rebuild the merged gold data from its individual sources and diff it
    against the previously persisted combined file.

    Pairs old and new items by article_id, collects pairs whose category
    assignments differ, persists those pairs to a differences file, and drops
    into an interactive shell for inspection.
    """
    class Config1_1(ConfigRoot):
        # g1 combined with tr2 produces gold data that was formerly persisted as 's1_articles__tr2_1__sc_sm_alle_anwendungsfelder.json'
        gold_data_json_path = data_flow_registry.gold_data["g1"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule2

    class Config1_2(ConfigRoot):
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config2(ConfigRoot):
        # formerly s2 in prodigy, now p1 in prodigy data, and persisted as gold data as g4
        gold_data_json_path = data_flow_registry.gold_data["g4"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule9

    class Config3(ConfigRoot):
        # formerly s3 in prodigy, now p2 in prodigy data, and persisted as gold data as g5
        gold_data_json_path = data_flow_registry.gold_data["g5"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config4(ConfigRoot):
        # formerly s4 in prodigy, now p3 in prodigy data, and persisted as gold data as g6
        gold_data_json_path = data_flow_registry.gold_data["g6"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config5(ConfigRoot):
        # formerly s5 in prodigy, now p4 in prodigy data, and persisted as gold data as g7
        gold_data_json_path = data_flow_registry.gold_data["g7"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config6(ConfigRoot):
        # formerly s6 in prodigy, now p5 in prodigy data, and persisted as gold data as g8
        gold_data_json_path = data_flow_registry.gold_data["g8"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    ConfigRoot.gold_data_json_path = "../data/gold_data/s1_articles__tr2_1__sc_sm_alle_anwendungsfelder_X.json"
    gdc_old = main.load_gold_data(ConfigRoot)

    def _load_and_transform(load_config, transform_configs):
        # Load one gold data source and apply its transform rule(s) in order.
        container = main.load_gold_data(load_config)
        for cfg in transform_configs:
            container = main.transform_gold_data(cfg, container)
        return container

    # (load config, transform configs) per source; g1 needs two transform passes.
    steps = [
        (Config1_1, (Config1_1, Config1_2)),
        (Config2, (Config2,)),
        (Config3, (Config3,)),
        (Config4, (Config4,)),
        (Config5, (Config5,)),
        (Config6, (Config6,)),
    ]

    gdc = None
    for load_config, transform_configs in steps:
        partial = _load_and_transform(load_config, transform_configs)
        if gdc is None:
            # All sources share the same category list after transformation.
            gdc = GoldDataContainer(cats_list=partial.cats_list)
        gdc = gold_data_manager.merge_assuming_identical_categories(gdc, partial)

    gdc_new = gdc

    # Index rebuilt items by article_id, keeping the first occurrence per id
    # (the original linear scan also matched the first hit). Replaces an
    # O(n^2) nested search with O(n) lookups.
    new_by_id = {}
    for gdi_n in gdc_new.gold_data_item_list:
        new_by_id.setdefault(gdi_n.article_id, gdi_n)

    pair_differences = []

    for i, gdi_o in enumerate(gdc_old.gold_data_item_list):

        gdi_n = new_by_id.get(gdi_o.article_id)

        if gdi_n is None:
            # Old item has no counterpart in the rebuilt data.
            print(i)
        elif gdi_o.cats != gdi_n.cats:
            pair_differences.append({"gdi_o": gdi_o, "gdi_n": gdi_n})
        else:
            # Counterpart found and categories agree.
            print(i)

    # Persist each differing pair (old item followed by new item) for inspection.
    gdc_d = GoldDataContainer(cats_list=gdc.cats_list)
    for p in pair_differences:
        gdc_d.gold_data_item_list.append(p["gdi_o"])
        gdc_d.gold_data_item_list.append(p["gdi_n"])

    ConfigRoot.gold_data_json_path = "../data/gold_data/differences.json"
    main.persist_gold_data(ConfigRoot, gdc_d)

    embed()
# Example 5
def run():
    """Rebuild the merged gold data with per-item source tags and report
    redundant article ids.

    Each GoldDataItem is tagged with the id of the data set it came from
    (g1, g4..g8) before merging, then `get_redundancies_by_id` is run on
    the combined container.
    """
    class Config1_1(ConfigRoot):
        # g1 combined with tr2 produces gold data that was formerly persisted as 's1_articles__tr2_1__sc_sm_alle_anwendungsfelder.json'
        gold_data_json_path = data_flow_registry.gold_data["g1"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule2

    class Config1_2(ConfigRoot):
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config2(ConfigRoot):
        # formerly s2 in prodigy, now p1 in prodigy data, and persisted as gold data as g4
        gold_data_json_path = data_flow_registry.gold_data["g4"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule9

    class Config3(ConfigRoot):
        # formerly s3 in prodigy, now p2 in prodigy data, and persisted as gold data as g5
        gold_data_json_path = data_flow_registry.gold_data["g5"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config4(ConfigRoot):
        # formerly s4 in prodigy, now p3 in prodigy data, and persisted as gold data as g6
        gold_data_json_path = data_flow_registry.gold_data["g6"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config5(ConfigRoot):
        # formerly s5 in prodigy, now p4 in prodigy data, and persisted as gold data as g7
        gold_data_json_path = data_flow_registry.gold_data["g7"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config6(ConfigRoot):
        # formerly s6 in prodigy, now p5 in prodigy data, and persisted as gold data as g8
        gold_data_json_path = data_flow_registry.gold_data["g8"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    ConfigRoot.gold_data_json_path = "../data/gold_data/s1_articles__tr2_1__sc_sm_alle_anwendungsfelder_X.json"
    # NOTE(review): gdc_old is never used below; the load is kept so file-access
    # behavior stays unchanged — confirm whether it can be dropped.
    gdc_old = main.load_gold_data(ConfigRoot)

    def _load_tagged(load_config, transform_configs, source):
        # Load one source, apply its transform rule(s) in order, and tag each
        # item with the id of the data set it came from.
        container = main.load_gold_data(load_config)
        for cfg in transform_configs:
            container = main.transform_gold_data(cfg, container)
        # TODO: For this to work, GoldDataItem and gold_data_manager must be adapted.
        for gdi in container.gold_data_item_list:
            gdi.source = source
        return container

    # (load config, transform configs, source tag) per data set; g1 needs two transforms.
    steps = [
        (Config1_1, (Config1_1, Config1_2), "g1"),
        (Config2, (Config2,), "g4"),
        (Config3, (Config3,), "g5"),
        (Config4, (Config4,), "g6"),
        (Config5, (Config5,), "g7"),
        (Config6, (Config6,), "g8"),
    ]

    gdc = None
    for load_config, transform_configs, source in steps:
        partial = _load_tagged(load_config, transform_configs, source)
        if gdc is None:
            # All sources share the same category list after transformation.
            gdc = GoldDataContainer(cats_list=partial.cats_list)
        gdc = gold_data_manager.merge_assuming_identical_categories(gdc, partial)

    get_redundancies_by_id(gdc)
# Example 6
def transform_to_gold_data(prodigy_data, db_config, ske_config) -> GoldDataContainer:
    """Turn accepted Prodigy annotation rows into a GoldDataContainer.

    The category list is taken from the options of the first row; only rows
    with answer == 'accept' are kept. Each row's article id is resolved from
    its metadata; URL-based rows (p2, p3, p4) are translated to doc ids via
    the DB, falling back to the SKE service.

    Args:
        prodigy_data: List of Prodigy annotation dicts.
        db_config: Config for the DB connection used for URL -> docid lookup.
        ske_config: Config for the SKE fallback lookup.

    Returns:
        A GoldDataContainer with one GoldDataItem per accepted row.

    Raises:
        Exception: If a row's text ID cannot be located in its metadata.
    """

    cats_list = [cats_dict["text"] for cats_dict in prodigy_data[0]["options"]]

    gold_data_item_list = []

    # We open this here in case we need it to convert URLs to doc.ids (p2, p3, p4)
    db_connection, db_cursor = open_db_connection(db_config, None, None)

    # try/finally so the connection is closed even if ID resolution raises
    # (the original leaked it on any exception inside the loop).
    try:
        for row in prodigy_data:

            if row['answer'] != 'accept':
                continue

            answers = row['accept']
            options = row['options']
            cats_assigned = {}

            for cat in cats_list:
                # Id of the first option whose label matches this category, if any.
                option_id = next(
                    (opt['id'] for opt in options if opt['text'] == cat), None)
                # 1 iff the option exists and was among the accepted answers.
                cats_assigned[cat] = 1 if option_id is not None and option_id in answers else 0

            article_id = None

            # p1: doc.id = row['label'] as well as row['meta']['article_id']
            # p2, p3, p4: row['meta']['url] -> transform to doc.id via SKE or DB
            # p5: doc.id = row['meta']['docid']

            if 'article_id' in row['meta']:  # p1
                article_id = row['meta']['article_id']
            elif 'docid' in row['meta']:  # p5
                article_id = row['meta']['docid']
            elif 'url' in row['meta']:  # p2, p3, p4
                # First we check whether there is an ID translation in the DB
                db_cursor.execute(
                    sql.SQL("""
                    SELECT {col_docid}
                    FROM {tbl_ids}
                    WHERE {col_url} = %(url)s
                """).format(
                        col_docid=sql.Identifier('docid'),
                        tbl_ids=sql.Identifier('ske_docid_pos'),
                        col_url=sql.Identifier('url_index1')
                    ),
                    {
                        'url': row['meta']['url']
                    }
                )
                result = db_cursor.fetchone()
                if result:
                    article_id = result['docid']
                else:
                    # If that fails, we prompt the SKE
                    pos = ske_manager.get_pos_from_url(row['meta']['url'])
                    article_id = ske_manager.get_docid_from_pos(ske_config, pos)
                    # TODO: Ideally we would then insert this new ID translation into the DB
            else:
                raise Exception("Couldn't locate the annotation's text ID.")

            # TODO: Maybe add a text clean-up here to remove the abundant whitespace? Does it make a difference for spacy however?
            gold_data_item_list.append(
                GoldDataItem(
                    article_id=article_id,
                    text=row["text"] if 'text' in row else row['html'],
                    cats=cats_assigned
                )
            )
    finally:
        close_db_connection(db_connection, db_cursor)

    log_manager.info_global(f"Keeping {len(gold_data_item_list)} data items. ")

    return GoldDataContainer(cats_list=cats_list, gold_data_item_list=gold_data_item_list)
def transform_to_gold_data_sentences(
    spacy_base_model: str,
    root_coding_node: CodingNode,
    article_annotated_list: List[ArticleAnnotated],
    sentence_split_func,
) -> GoldDataContainer:
    """Convert article-level annotations into sentence-level gold data.

    Each article is split into sentences; every sentence starts with all
    categories set to 0, and a category is flipped to 1 when a sentence of
    an annotated coding segment occurs verbatim inside the article sentence.

    Args:
        spacy_base_model: Name of the spaCy base model. NOTE(review): not used
            inside this function body — presumably consumed elsewhere; confirm.
        root_coding_node: Root of the coding tree used to build the category overview.
        article_annotated_list: Annotated articles to transform.
        sentence_split_func: Callable splitting a text into sentence objects
            (objects are converted with str() before use).

    Returns:
        A GoldDataContainer with one GoldDataItem per article sentence.
    """

    gold_data_container = GoldDataContainer()

    # Populate gold_data_container.cats_list from the coding tree.
    gold_data_container = create_cats_overview(gold_data_container,
                                               root_coding_node,
                                               article_annotated_list)

    def save_gold_data_into_container(
        gold_data_container: GoldDataContainer,
        article_annotated_list: List[ArticleAnnotated],
    ) -> GoldDataContainer:
        # Fill gold_data_container.gold_data_item_list with one item per sentence.

        cats_list = gold_data_container.cats_list
        gold_data_container.gold_data_item_list = []

        log_manager.info_global(
            "Starting to transform articles with annotations to sentences with annotations.\n"
            "This will take a while.")

        len_article_annotated_list = len(article_annotated_list)
        for i, article_annotated in enumerate(article_annotated_list, start=1):
            # Log progress every 100 articles and at the last one
            # (i % len == 0 only happens at i == len, since 1 <= i <= len).
            if i % 100 == 0 or i % len_article_annotated_list == 0:
                log_manager.info_global(
                    f"at article number: {i}, out of {len_article_annotated_list}"
                )

            # Maps sentence text -> {category: 0/1}; duplicate sentence strings
            # within one article collapse into a single entry.
            sentence_cats_dict = OrderedDict()
            sentence_article_list = sentence_split_func(
                article_annotated.article_file_content_cleaned)

            for sentence in sentence_article_list:

                # Start with every category unset for this sentence.
                sentence_cats_dict[str(sentence)] = {
                    cat: 0
                    for cat in cats_list
                }

            for coding in article_annotated.coding_list:

                # Strip markup tags from the coded segment before splitting.
                segment = coding["Segment"]
                segment = re.sub("<.*?>", "", segment)
                sentence_segment_list = sentence_split_func(segment)

                for sentence_segment in sentence_segment_list:

                    for sentence_article in sentence_cats_dict.keys():

                        # Substring match: the segment sentence must occur
                        # verbatim inside the article sentence.
                        if str(sentence_segment) in sentence_article:

                            cat = coding["coding_node"].coding_value
                            cat_used_dict = sentence_cats_dict[
                                sentence_article]
                            # Only flag categories known to the container.
                            if cat in cat_used_dict:
                                cat_used_dict[cat] = 1

                            # Stop at the first matching article sentence.
                            break

            for sentence, cats in sentence_cats_dict.items():

                gold_data_container.gold_data_item_list.append(
                    GoldDataItem(article_id=article_annotated.article_id,
                                 text=sentence,
                                 cats=cats))

        log_manager.info_global(
            f"Transformed {len(article_annotated_list)} articles into {len(gold_data_container.gold_data_item_list)} sentences."
        )

        return gold_data_container

    gold_data_container = save_gold_data_into_container(
        gold_data_container=gold_data_container,
        article_annotated_list=article_annotated_list,
    )

    return gold_data_container