def run_4():
    """Load three gold-data dumps, pick out one specific article from the new
    and the original dump, then re-run the transform pipeline on a container
    reduced to only that article (for manual side-by-side inspection)."""

    class C1(ConfigRoot):
        gold_data_json_path = "../data/gold_data/g1.json"

    class C2(ConfigRoot):
        gold_data_json_path = "../data/gold_data/s1_articles__tr2_1__sc_sm_alle_anwendungsfelder_ORIGINAL.json"

    class C3(ConfigRoot):
        gold_data_json_path = "../data/gold_data/differences.json"

    gdc1 = main.load_gold_data(C1)
    gdc2 = main.load_gold_data(C2)
    gdc3 = main.load_gold_data(C3)  # loaded for interactive inspection

    # Find the target article in the new dump; it must occur exactly once.
    a = None
    for gdi in gdc1.gold_data_item_list:
        if gdi.article_id == "STANDARD_200203161928100152":
            if a is not None:
                raise Exception()
            a = gdi

    # BUGFIX: this second lookup previously iterated gdc1 again (copy-paste
    # error), so `b` was always identical to `a` and gdc2 was never searched.
    b = None
    for gdi in gdc2.gold_data_item_list:
        if gdi.article_id == "STANDARD_200203161928100152":
            if b is not None:
                raise Exception()
            b = gdi

    class C4(ConfigRoot):
        gold_data_json_path = data_flow_registry.gold_data["g1"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule2

    class C5(ConfigRoot):
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    gdc4 = main.load_gold_data(C4)

    # Locate the unique index of the target article (renamed from `id`,
    # which shadowed the builtin).
    idx = None
    for i, gdi in enumerate(gdc4.gold_data_item_list):
        if gdi.article_id == "STANDARD_200203161928100152":
            if idx is not None:
                raise Exception()
            idx = i

    # Trim the container down to just the target article.
    del gdc4.gold_data_item_list[0:idx]
    del gdc4.gold_data_item_list[1:]

    gdc4 = main.transform_gold_data(C4, gdc4)
    gdc4 = main.transform_gold_data(C5, gdc4)
    print()  # breakpoint anchor
def run():
    """Merge six gold-data sources into one container and train on it.

    The first source needs two transform passes and seeds the merge target;
    the remaining five follow an identical load -> transform -> merge pattern
    (previously written out five times; now a single loop).
    """
    gdc_1 = main.load_gold_data(Config1_1)
    gdc_1 = main.transform_gold_data(Config1_1, gdc_1)
    gdc_1 = main.transform_gold_data(Config1_2, gdc_1)
    gdc = GoldDataContainer(cats_list=gdc_1.cats_list)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_1)

    # Remaining sources: one transform pass each, merged in order.
    for config in (Config2, Config3, Config4, Config5, Config6):
        gdc_next = main.load_gold_data(config)
        gdc_next = main.transform_gold_data(config, gdc_next)
        gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_next)

    trainer = main.init_trainer(ConfigTrain, cats_list=gdc.cats_list)
    main.run_training(config=ConfigTrain, trainer=trainer, gold_data_container=gdc)
    embed()  # drop into an interactive shell for inspection
def run():
    """Load and transform the base gold data, then run training twice:
    once with the 100% and once with the 80% training-data config."""
    container = main.load_gold_data(ConfigBase)
    container = main.transform_gold_data(ConfigBase, container)
    for train_config in (ConfigTdc100, ConfigTdc80):
        trainer = main.init_trainer(train_config, cats_list=container.cats_list)
        main.run_training(config=train_config, trainer=trainer, gold_data_container=container)
def run_5():
    """Compare freshly derived gold data against the originally persisted
    dumps, reducing the '_X' variant to one known article before transforming."""

    class C1(ConfigRoot):
        gold_data_json_path = "../data/gold_data/g1.json"

    class C2(ConfigRoot):
        gold_data_json_path = "../data/gold_data/s1_articles__tr2_1__sc_sm_alle_anwendungsfelder_ORIGINAL.json"

    gdc_n = main.load_gold_data(C1)
    gdc_o = main.load_gold_data(C2)

    ConfigRoot.gold_data_json_path = "../data/gold_data/s1_articles__tr2_1__sc_sm_alle_anwendungsfelder_X.json"
    gdc_o_X = main.load_gold_data(ConfigRoot)

    def reduce_gdc(gdc):
        # Trim the container down to the single occurrence of the target
        # article. BUGFIX: the previous version sliced with an unassigned
        # `id` (resolving to the builtin function), which raised TypeError;
        # the index is now actually located first, duplicates are rejected,
        # and a missing article raises instead of silently emptying the list.
        match_index = None
        for i, gdi in enumerate(gdc.gold_data_item_list):
            if gdi.article_id == "STANDARD_200203161928100152":
                if match_index is not None:
                    raise Exception()
                match_index = i
        if match_index is None:
            raise Exception()
        del gdc.gold_data_item_list[0:match_index]
        del gdc.gold_data_item_list[1:]

    # reduce_gdc(gdc_n)
    # reduce_gdc(gdc_o)
    reduce_gdc(gdc_o_X)

    class C3(ConfigRoot):
        gold_data_transform_rule = gold_data_transform_rules.TransformRule2

    class C4(ConfigRoot):
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    gdc_n = main.transform_gold_data(C3, gdc_n)
    gdc_n = main.transform_gold_data(C4, gdc_n)
    gdc_o = main.transform_gold_data(C4, gdc_o)
    print()  # breakpoint anchor
def run():
    """Train for 30 rounds; round 0 creates a fresh model, every later
    round reloads the persisted one and continues training."""
    data = main.load_gold_data(ConfigSub)
    data = main.transform_gold_data(ConfigSub, data)
    for round_no in range(30):
        first_round = round_no == 0
        ConfigSub.should_load_model = not first_round
        ConfigSub.should_create_model = first_round
        trainer = main.init_trainer(config=ConfigSub, cats_list=data.cats_list)
        main.run_training(ConfigSub, trainer, data)
def train(trainer1, trainer2):
    """Run one comparison-training round for both configs.

    On the first call (trainer1 is None) both trainers are created fresh;
    afterwards the passed-in trainers are reused, so repeated calls continue
    training the same two models. Returns the (possibly new) trainer pair.
    """
    gdc = main.load_gold_data(ConfigTrainCompareBase)
    gdc = main.transform_gold_data(ConfigTrainCompareBase, gdc)

    needs_init = trainer1 is None
    if needs_init:
        ConfigTrainCompareBase.should_load_model = False
        ConfigTrainCompareBase.should_create_model = True
        trainer1 = main.init_trainer(ConfigTrainCompare1, cats_list=gdc.cats_list)
        trainer2 = main.init_trainer(ConfigTrainCompare2, cats_list=gdc.cats_list)

    for cfg, tr in ((ConfigTrainCompare1, trainer1), (ConfigTrainCompare2, trainer2)):
        main.run_training(cfg, tr, gdc)

    return trainer1, trainer2
def run():
    """Single end-to-end pass: load gold data, transform it, build a
    trainer, and run one training."""
    container = main.load_gold_data(ConfigSub)
    container = main.transform_gold_data(ConfigSub, container)
    sub_trainer = main.init_trainer(config=ConfigSub, cats_list=container.cats_list)
    main.run_training(ConfigSub, sub_trainer, container)
def run():
    """Build a combined per-article DataFrame (VR annotation, AF annotations,
    language-metaphor counts from the DB) and drop into an interactive shell.

    Exposes `df`, `maskAF`, and `maskVR` to the embed() session.
    """
    # get the VR info
    eval_data_container = main.load_gold_data(ConfigLoadG1)
    eval_data_container_VR = main.transform_gold_data(ConfigTransformG1VR, eval_data_container)
    df_VR = pd.DataFrame(
        data=[{
            "article_id": gdi.article_id,
            "VR=ja": gdi.cats['Verantwortungsreferenz'] == 1,
        } for gdi in eval_data_container_VR.gold_data_item_list])
    # get the AF info
    # NOTE(review): data is reloaded because the VR transform above mutated
    # the container — presumably; verify against transform_gold_data.
    eval_data_container = main.load_gold_data(ConfigLoadG1)
    eval_data_container_AF = main.transform_gold_data(
        ConfigTransformG1AF_Part1, eval_data_container)
    #eval_data_container_AF = main.transform_gold_data(ConfigTransformG1AF_Part2, eval_data_container_AF)
    df_AF = pd.DataFrame(data=[{
        "article_id": gdi.article_id,
        "AF=SM": gdi.cats['AF: Soziale Medien'] == 1,
        "AF=SC": gdi.cats['AF: Social Companions'] == 1,
    } for gdi in eval_data_container_AF.gold_data_item_list])
    # for each text, read from the DB how many LM it contains
    db_connection, db_cursor = db_manager.open_db_connection(
        db_config={
            "host": credentials.db_host,
            "dbname": credentials.db_name,
            "user": credentials.db_user,
            "password": credentials.db_password,
            "port": credentials.db_port
        })
    # Aggregate token counts per document, restricted to the loaded articles;
    # table name is injected via sql.Identifier, ids via a bound parameter.
    db_cursor.execute(
        sql.SQL("""
            select t.docid as id, count(distinct t.keyword_id) as dist, sum(t.token_count) as total
            from {table_name} as t
            where t.docid = any( %(docid_list)s )
            group by t.docid
            order by t.docid asc
        """).format(
            table_name=sql.Identifier('index_2__mara002__lmvr_tokens')),
        {
            'docid_list': [
                gdi.article_id for gdi in eval_data_container.gold_data_item_list
            ],
        })
    results = db_cursor.fetchall()
    df_LM = pd.DataFrame(data=[{
        "article_id": r['id'],
        "LMs total": r['total'],
        "LMs distinct": r['dist'],
    } for r in results])
    # close db connection
    db_manager.close_db_connection(db_connection, db_cursor)
    # merge the 3 dataframes
    df = df_LM.merge(df_AF, how='outer', on='article_id')
    df = df.merge(df_VR, how='outer', on='article_id')
    # the LM table in the db doesn't contain all texts, so we have NaN values. Replace those with 0.
    df['LMs total'] = df['LMs total'].fillna(0)
    df['LMs distinct'] = df['LMs distinct'].fillna(0)
    # define shortcuts to filter the dataframe
    maskAF = (df['AF=SC'] == True) | (df['AF=SM'] == True)
    maskVR = (df['VR=ja'] == True)
    main.log_manager.info_global(
        "--------------------------------\n"
        "Calculations complete. \n"
        "You can now access the DataFrame as `df`. \n"
        "There are 2 masks provided as `maskAF` (SC or SM) and `maskVR` (trivial). \n"
    )
    # usage example:
    # df[maskAF & maskVR]
    # df[~maskVR]
    embed()
def run():
    """Evaluate the VR model (mo11) on dataset g8 three ways: over all
    texts, over the subset the AF model (mo9) *predicts* as SM/SC, and
    over the subset *annotated* as SM/SC."""
    eval_data_container = main.load_gold_data(ConfigLoadG8)
    eval_data_container = main.transform_gold_data(ConfigLoadG8, eval_data_container)
    modelVR = main.init_trainer(ConfigLoadVRModel)
    main.log_manager.info_global(
        "--------------------------------\n"
        "Evaluating mo11 over the entire dataset g8: \n"
    )
    scores_spacy, scores_manual = modelVR.evaluate(eval_data_container)

    # Subset 1: keep texts that mo9 predicts as AF=SM or AF=SC.
    modelAF = main.init_trainer(ConfigLoadAFModel)
    gdis_to_keep = []
    for gdi in eval_data_container.gold_data_item_list:
        predicted = modelAF.nlp(gdi.text).cats
        if predicted['AF: Social Companions'] > 0.5 or predicted['AF: Soziale Medien'] > 0.5:
            gdis_to_keep.append(gdi)
    eval_data_container2 = GoldDataContainer()
    eval_data_container2.cats_list = eval_data_container.cats_list
    eval_data_container2.gold_data_item_list = gdis_to_keep
    main.log_manager.info_global(
        "--------------------------------\n"
        "Evaluating mo11 over those texts in g8 that mo9 predicts to be AF=SM or AF=SC: \n"
    )
    scores_spacy2, scores_manual2 = modelVR.evaluate(eval_data_container2)

    # Subset 2: keep texts annotated as AF=SM or AF=SC. Reload the data to
    # undo the transformation that removed the AF categories.
    eval_data_container = main.load_gold_data(ConfigLoadG8)
    gdis_to_keep = [
        gdi for gdi in eval_data_container.gold_data_item_list
        if gdi.cats['AF: Social Companions'] == 1 or gdi.cats['AF: Soziale Medien'] == 1
    ]
    eval_data_container3 = GoldDataContainer()
    eval_data_container3.cats_list = eval_data_container.cats_list
    eval_data_container3.gold_data_item_list = gdis_to_keep
    # Now apply the transformation that removes all categories except VR.
    eval_data_container3 = main.transform_gold_data(ConfigLoadG8, eval_data_container3)
    main.log_manager.info_global(
        "--------------------------------\n"
        "Evaluating mo11 over those texts in g8 that were annotated as AF=SM or AF=SC: \n"
    )
    scores_spacy3, scores_manual3 = modelVR.evaluate(eval_data_container3)
    embed()
def run_2():
    """Rebuild the merged gold data from the registry sources, diff it
    against the formerly persisted '_X' container, and persist all
    differing item pairs to 'differences.json'."""

    class Config1_1(ConfigRoot):
        # g1 combined with tr2 produces gold data that was formerly persisted as 's1_articles__tr2_1__sc_sm_alle_anwendungsfelder.json'
        gold_data_json_path = data_flow_registry.gold_data["g1"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule2

    class Config1_2(ConfigRoot):
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config2(ConfigRoot):
        # formerly s2 in prodigy, now p1 in prodigy data, and persisted as gold data as g4
        gold_data_json_path = data_flow_registry.gold_data["g4"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule9

    class Config3(ConfigRoot):
        # formerly s3 in prodigy, now p2 in prodigy data, and persisted as gold data as g5
        gold_data_json_path = data_flow_registry.gold_data["g5"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config4(ConfigRoot):
        # formerly s4 in prodigy, now p3 in prodigy data, and persisted as gold data as g6
        gold_data_json_path = data_flow_registry.gold_data["g6"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config5(ConfigRoot):
        # formerly s5 in prodigy, now p4 in prodigy data, and persisted as gold data as g7
        gold_data_json_path = data_flow_registry.gold_data["g7"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config6(ConfigRoot):
        # formerly s6 in prodigy, now p5 in prodigy data, and persisted as gold data as g8
        gold_data_json_path = data_flow_registry.gold_data["g8"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    ConfigRoot.gold_data_json_path = "../data/gold_data/s1_articles__tr2_1__sc_sm_alle_anwendungsfelder_X.json"
    gdc_old = main.load_gold_data(ConfigRoot)

    # First source needs two transform passes and seeds the merge target;
    # the remaining sources follow an identical pattern (was five copies).
    gdc_1 = main.load_gold_data(Config1_1)
    gdc_1 = main.transform_gold_data(Config1_1, gdc_1)
    gdc_1 = main.transform_gold_data(Config1_2, gdc_1)
    gdc = GoldDataContainer(cats_list=gdc_1.cats_list)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_1)
    for config in (Config2, Config3, Config4, Config5, Config6):
        gdc_next = main.load_gold_data(config)
        gdc_next = main.transform_gold_data(config, gdc_next)
        gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_next)
    gdc_new = gdc

    # Pair up old/new items by article_id; collect pairs whose categories
    # differ. (Removed the unused `texts_equal` local.) Items whose cats
    # match, or which are missing from the new container, just print their
    # index for manual follow-up.
    pair_differences = []
    for i, gdi_o in enumerate(gdc_old.gold_data_item_list):
        found = False
        for gdi_n in gdc_new.gold_data_item_list:
            if gdi_o.article_id == gdi_n.article_id:
                if gdi_o.cats != gdi_n.cats:
                    pair_differences.append({"gdi_o": gdi_o, "gdi_n": gdi_n})
                else:
                    print(i)
                found = True
                break
        if not found:
            print(i)

    # Persist the differing pairs (old item followed by new item).
    gdc_d = GoldDataContainer(cats_list=gdc.cats_list)
    for p in pair_differences:
        gdc_d.gold_data_item_list.append(p["gdi_o"])
        gdc_d.gold_data_item_list.append(p["gdi_n"])
    ConfigRoot.gold_data_json_path = "../data/gold_data/differences.json"
    main.persist_gold_data(ConfigRoot, gdc_d)
    embed()
def run():
    """Merge the six registry gold-data sources, tagging every item with
    its source id, then report id redundancies across the merged set."""

    class Config1_1(ConfigRoot):
        # g1 combined with tr2 produces gold data that was formerly persisted as 's1_articles__tr2_1__sc_sm_alle_anwendungsfelder.json'
        gold_data_json_path = data_flow_registry.gold_data["g1"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule2

    class Config1_2(ConfigRoot):
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config2(ConfigRoot):
        # formerly s2 in prodigy, now p1 in prodigy data, and persisted as gold data as g4
        gold_data_json_path = data_flow_registry.gold_data["g4"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule9

    class Config3(ConfigRoot):
        # formerly s3 in prodigy, now p2 in prodigy data, and persisted as gold data as g5
        gold_data_json_path = data_flow_registry.gold_data["g5"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config4(ConfigRoot):
        # formerly s4 in prodigy, now p3 in prodigy data, and persisted as gold data as g6
        gold_data_json_path = data_flow_registry.gold_data["g6"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config5(ConfigRoot):
        # formerly s5 in prodigy, now p4 in prodigy data, and persisted as gold data as g7
        gold_data_json_path = data_flow_registry.gold_data["g7"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    class Config6(ConfigRoot):
        # formerly s6 in prodigy, now p5 in prodigy data, and persisted as gold data as g8
        gold_data_json_path = data_flow_registry.gold_data["g8"]["path"]
        gold_data_transform_rule = gold_data_transform_rules.TransformRule8

    # NOTE(review): gdc_old is loaded but not used below — kept for parity
    # with the sibling diff function; confirm whether it can be dropped.
    ConfigRoot.gold_data_json_path = "../data/gold_data/s1_articles__tr2_1__sc_sm_alle_anwendungsfelder_X.json"
    gdc_old = main.load_gold_data(ConfigRoot)

    # First source needs two transform passes and seeds the merge target.
    gdc_1 = main.load_gold_data(Config1_1)
    gdc_1 = main.transform_gold_data(Config1_1, gdc_1)
    gdc_1 = main.transform_gold_data(Config1_2, gdc_1)
    # TODO: For this to work, GoldDataItem and gold_data_manager must be adapted.
    for gdi in gdc_1.gold_data_item_list:
        gdi.source = "g1"
    gdc = GoldDataContainer(cats_list=gdc_1.cats_list)
    gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_1)

    # Remaining sources: load, transform, tag with their source id, merge.
    # (Was five hand-expanded copies of the same stanza.)
    for config, source_label in (
        (Config2, "g4"),
        (Config3, "g5"),
        (Config4, "g6"),
        (Config5, "g7"),
        (Config6, "g8"),
    ):
        gdc_next = main.load_gold_data(config)
        gdc_next = main.transform_gold_data(config, gdc_next)
        for gdi in gdc_next.gold_data_item_list:
            gdi.source = source_label
        gdc = gold_data_manager.merge_assuming_identical_categories(gdc, gdc_next)

    get_redundancies_by_id(gdc)