def run():
    """Create a random-subset reference table, then run train/index/compare cycles.

    Opens a DB connection from ``ConfigRoot`` credentials, redirects
    ``ConfigIndexBase.table_name_ref_articles`` to a ``<old>_random_subset``
    table, creates that table, and runs 100 iterations of
    train -> drop_index -> create_index -> compare_index.

    Fixes over the previous version:
    - trainer initialization is hoisted out of the loop (the old
      ``if i == 0`` branch only ever fired on the first iteration);
    - the DB connection is closed in a ``finally`` block so it is not
      leaked when an iteration raises.
    """
    db_config = {
        "host": ConfigRoot.db_host,
        "dbname": ConfigRoot.db_name,
        "user": ConfigRoot.db_user,
        "password": ConfigRoot.db_password,
        "port": ConfigRoot.db_port
    }
    db_connection, db_cursor = db_manager.open_db_connection(db_config)
    try:
        table_name_ref_articles_old = ConfigIndexBase.table_name_ref_articles
        table_name_ref_articles_new = table_name_ref_articles_old + "_random_subset"
        # Side effect: every later reader of ConfigIndexBase sees the subset table.
        ConfigIndexBase.table_name_ref_articles = table_name_ref_articles_new

        create_main_reference_table(
            db_cursor,
            db_connection,
            table_name_ref_articles_old,
            table_name_ref_articles_new)  # TODO: comment this out if not needed

        # Trainers are threaded through the loop and built up incrementally.
        trainer1 = None
        trainer2 = None
        for i in range(100):
            trainer1, trainer2 = train(trainer1, trainer2)  # TODO: comment this out if not needed
            drop_index(db_cursor, db_connection)  # TODO: comment this out if not needed
            create_index(trainer1, trainer2)  # TODO: comment this out if not needed
            compare_index(db_cursor, i)  # TODO: comment this out if not needed
    finally:
        db_manager.close_db_connection(db_connection, db_cursor)
def stream_from_db_with_predictions(ske_config, db_config, index_table_name):
    """Endless generator yielding annotation tasks ordered by model score.

    Repeatedly selects the single not-yet-annotated, not-recently-selected row
    with the LOWEST combined prediction score from ``index_table_name``, marks
    it as selected, fetches the document text via the SKE API, and yields a
    task dict (text/options/accept/meta — presumably Prodigy's task format;
    TODO confirm against the consuming recipe).

    Parameters:
        ske_config: configuration passed through to ske_manager for API access.
        db_config: connection settings for db_manager.open_db_connection.
        index_table_name: name of the index table to stream candidates from.
    """
    log_manager.debug_global("Streaming from DB with predictions ...")
    db_connection = None
    db_cursor = None
    (db_connection, db_cursor) = db_manager.open_db_connection(db_config, db_connection, db_cursor)
    try:
        while True:
            # Pick one candidate: sum of the two prediction columns, ascending,
            # skipping rows already annotated/selected and rows selected within
            # the last 2 days (so concurrently served rows eventually recycle).
            db_cursor.execute(
                sql.SQL(
                    'SELECT *, ("AF: Social Companions" + "AF: Soziale Medien") AS AF_SC_SM '
                    'FROM {index_table_name} '
                    'WHERE already_annotated = FALSE '
                    'AND already_selected = FALSE '  # left-over from the old system
                    "AND ((selected_on IS NULL) OR (selected_on < (NOW() - INTERVAL '2 days'))) "
                    'ORDER BY AF_SC_SM ASC '
                    'LIMIT 1').format(
                        index_table_name=sql.Identifier(index_table_name)))
            result = db_cursor.fetchone()
            # NOTE(review): when no row matches, fetchone() returns None and the
            # subscript below raises TypeError, which the broad except swallows —
            # that is how the stream ends when the table is exhausted.
            url = result["url"]
            # Mark the row as selected so it is not served again immediately.
            _select_text(db_connection, db_cursor, index_table_name, 'url', url)
            options = _preselect_options(result)
            ske_doc = ske_manager.get_doc_from_url(ske_config, url)
            yield {
                "text": ske_doc["text"],
                "options": options['cats_as_options'],
                "accept": options['options_accepted'],
                "meta": {
                    "url": url,
                    "scores": options['scores_text']
                }
            }
    except Exception as ex:
        # Best-effort: any error (DB, API, exhausted table) just stops the
        # generator after printing the exception.
        print(ex)
    finally:
        db_manager.close_db_connection(db_connection, db_cursor)
def _main():
    """Initialize and populate the index table inside a single transaction.

    Relies on module-level ``db_connection`` and ``db_cursor`` (defined outside
    this chunk). Commits on success and rolls back on any error; the SKE
    session is closed on both paths, and the DB connection is always closed
    in ``finally``.
    """
    try:
        pos_start = _init_index_table()
        _populate_index_table(pos_start)
        db_connection.commit()
        ske_manager.close_session()
    except Exception as e:
        db_connection.rollback()
        ske_manager.close_session()
        raise e
    finally:
        db_manager.close_db_connection(db_connection, db_cursor)
def create_tables(db_config, index1_table_name, index2_table_names):
    """Drop and re-create the three "index 2" tables (keywords, scores, tokens).

    Parameters:
        db_config: connection settings for db_manager.open_db_connection.
        index1_table_name: accepted but not used in this function's body —
            NOTE(review): possibly kept for signature symmetry; confirm.
        index2_table_names: dict with keys 'keywords', 'scores', 'tokens'
            mapping to the physical table names.

    All DDL runs in one transaction: committed on success, rolled back on any
    error; the connection is always closed in ``finally``.
    """
    (db_connection, db_cursor) = db_manager.open_db_connection(db_config)
    try:
        # Drop in one statement; CASCADE removes dependent objects (e.g. the
        # tokens table's FKs into keywords/scores).
        log_manager.debug_global("Dropping tables ...")
        db_cursor.execute(
            sql.SQL("""
                DROP TABLE IF EXISTS {table_keywords}, {table_scores}, {table_tokens} CASCADE;
                DROP INDEX IF EXISTS {score_idx} CASCADE;
            """).format(
                table_keywords=sql.Identifier(index2_table_names['keywords']),
                table_scores=sql.Identifier(index2_table_names['scores']),
                table_tokens=sql.Identifier(index2_table_names['tokens']),
                score_idx=sql.Identifier(
                    'index_2__mara002__lmvr_scores_score_rarity_diversity_idx')
            ))

        # table 1: keywords
        log_manager.debug_global(
            f"Creating table {index2_table_names['keywords']} ...")
        db_cursor.execute(
            sql.SQL("""
                CREATE TABLE {table} (
                    {pk} varchar NOT NULL,
                    corpus_count int4 NOT NULL,
                    category varchar NOT NULL,
                    CONSTRAINT index_2__mara002__lmvr_keywords_pk PRIMARY KEY ({pk})
                );
            """).format(table=sql.Identifier(index2_table_names['keywords']),
                        pk=sql.Identifier('keyword_id')))

        # table 2: texts + scores (one row per document, with a btree index on
        # the score for fast "best first" ordering)
        log_manager.debug_global(
            f"Creating table {index2_table_names['scores']} ...")
        db_cursor.execute(
            sql.SQL("""
                CREATE TABLE {table} (
                    {pk} varchar NOT NULL,
                    {score1} numeric NOT NULL,
                    already_annotated bool NULL,
                    selected_on timestamptz NULL,
                    CONSTRAINT index_2__mara002__lmvr_scores_pk PRIMARY KEY ({pk})
                );
                CREATE INDEX index_2__mara002__lmvr_scores_score_rarity_diversity_idx
                ON {table} USING btree ({score1} DESC);
            """).format(table=sql.Identifier(index2_table_names['scores']),
                        pk=sql.Identifier('docid'),
                        score1=sql.Identifier('score_rarity_diversity')))

        # table 3: keywords in texts (many-to-many join table with per-text
        # token counts; FKs cascade so dropping a text/keyword cleans up here)
        log_manager.debug_global(
            f"Creating table {index2_table_names['tokens']} ...")
        db_cursor.execute(
            sql.SQL("""
                CREATE TABLE {table} (
                    {fk_texts} varchar NOT NULL,
                    {fk_kw} varchar NOT NULL,
                    token_count int4 NOT NULL DEFAULT 0,
                    CONSTRAINT index_2__mara002__lmvr_tokens_pk PRIMARY KEY ({fk_texts}, {fk_kw}),
                    CONSTRAINT index_2__mara002__lmvr_tokens_fk FOREIGN KEY ({fk_texts}) REFERENCES {table_texts}({fk_texts}) ON UPDATE CASCADE ON DELETE CASCADE,
                    CONSTRAINT index_2__mara002__lmvr_tokens_fk_keyword FOREIGN KEY ({fk_kw}) REFERENCES {table_kw}({fk_kw}) ON UPDATE CASCADE ON DELETE CASCADE
                );
            """).format(
                table=sql.Identifier(index2_table_names['tokens']),
                table_texts=sql.Identifier(index2_table_names['scores']),
                fk_texts=sql.Identifier('docid'),
                table_kw=sql.Identifier(index2_table_names['keywords']),
                fk_kw=sql.Identifier('keyword_id')))

        db_connection.commit()
    except Exception as e:
        db_connection.rollback()
        raise e
    finally:
        db_manager.close_db_connection(db_connection, db_cursor)
    return  # TODO: Is this empty return on purpose?
def run():
    """Interactive analysis: join gold-data labels with per-text LM counts.

    Builds three DataFrames keyed by ``article_id`` — VR labels, AF labels,
    and linguistic-means (LM) counts read from the DB — outer-merges them,
    defines two boolean masks, and drops into an interactive ``embed()``
    shell so ``df``, ``maskAF`` and ``maskVR`` can be explored by hand.
    """
    # get the VR info
    eval_data_container = main.load_gold_data(ConfigLoadG1)
    eval_data_container_VR = main.transform_gold_data(ConfigTransformG1VR, eval_data_container)
    df_VR = pd.DataFrame(
        data=[{
            "article_id": gdi.article_id,
            "VR=ja": gdi.cats['Verantwortungsreferenz'] == 1,
        } for gdi in eval_data_container_VR.gold_data_item_list])

    # get the AF info
    # (gold data is re-loaded because transform_gold_data above may have
    # consumed/altered the first container — TODO confirm)
    eval_data_container = main.load_gold_data(ConfigLoadG1)
    eval_data_container_AF = main.transform_gold_data(
        ConfigTransformG1AF_Part1, eval_data_container)
    #eval_data_container_AF = main.transform_gold_data(ConfigTransformG1AF_Part2, eval_data_container_AF)
    df_AF = pd.DataFrame(data=[{
        "article_id": gdi.article_id,
        "AF=SM": gdi.cats['AF: Soziale Medien'] == 1,
        "AF=SC": gdi.cats['AF: Social Companions'] == 1,
    } for gdi in eval_data_container_AF.gold_data_item_list])

    # for each text, read from the DB how many LM it contains
    db_connection, db_cursor = db_manager.open_db_connection(
        db_config={
            "host": credentials.db_host,
            "dbname": credentials.db_name,
            "user": credentials.db_user,
            "password": credentials.db_password,
            "port": credentials.db_port
        })
    # Aggregate the tokens join table: distinct keywords and total token count
    # per document, restricted to the gold-data article ids.
    db_cursor.execute(
        sql.SQL("""
            select t.docid as id, count(distinct t.keyword_id) as dist, sum(t.token_count) as total
            from {table_name} as t
            where t.docid = any( %(docid_list)s )
            group by t.docid
            order by t.docid asc
        """).format(
            table_name=sql.Identifier('index_2__mara002__lmvr_tokens')), {
                'docid_list': [
                    gdi.article_id
                    for gdi in eval_data_container.gold_data_item_list
                ],
            })
    results = db_cursor.fetchall()
    df_LM = pd.DataFrame(data=[{
        "article_id": r['id'],
        "LMs total": r['total'],
        "LMs distinct": r['dist'],
    } for r in results])

    # close db connection
    db_manager.close_db_connection(db_connection, db_cursor)

    # merge the 3 dataframes
    df = df_LM.merge(df_AF, how='outer', on='article_id')
    df = df.merge(df_VR, how='outer', on='article_id')

    # the LM table in the db doesn't contain all texts, so we have NaN values. Replace those with 0.
    df['LMs total'] = df['LMs total'].fillna(0)
    df['LMs distinct'] = df['LMs distinct'].fillna(0)

    # define shortcuts to filter the dataframe
    # (`== True` rather than plain truthiness: outer-merged label columns may
    # contain NaN, which compares as False here)
    maskAF = (df['AF=SC'] == True) | (df['AF=SM'] == True)
    maskVR = (df['VR=ja'] == True)

    main.log_manager.info_global(
        "--------------------------------\n"
        "Calculations complete. \n"
        "You can now access the DataFrame as `df`. \n"
        "There are 2 masks provided as `maskAF` (SC or SM) and `maskVR` (trivial). \n"
    )

    # usage example:
    # df[maskAF & maskVR]
    # df[~maskVR]

    embed()
def on_exit(controller):
    """Prodigy exit hook: close the module-level DB connection.

    Parameters:
        controller: the Prodigy controller instance (unused here; required by
            the hook signature).

    Relies on module-level ``db_connection`` and ``db_cursor`` defined
    outside this chunk.
    """
    log_manager.debug_global("Prodigy: exiting ...")
    db_manager.close_db_connection(db_connection, db_cursor)
def run(ske_config, db_config, docid_table_name, index1_table_name, index2_table_names, should_drop_create_table=False):
    """Fill the docid/pos/url translation table in both directions.

    Direction 1 finds URLs (from index 1) missing from the translation table
    and resolves url -> pos -> docid; direction 2 finds docids (from index 2)
    and resolves docid -> pos -> url. Each resolved record is inserted into
    ``docid_table_name``.

    Parameters:
        ske_config: SKE API configuration for ske_manager.
        db_config: connection settings for db_manager.open_db_connection.
        docid_table_name: name of the translation table.
        index1_table_name: source table for URLs (direction 1).
        index2_table_names: source table names for docids (direction 2).
        should_drop_create_table: when True, (re)create the translation table first.

    Refactor: the two near-identical convert-and-insert loops are extracted
    into private helpers; behavior and signature are unchanged.
    """
    (db_connection, db_cursor) = db_manager.open_db_connection(db_config)

    if should_drop_create_table:
        create_table(db_connection, db_cursor, docid_table_name)

    # Direction 1: look for URLs that are not yet in the translation table.
    # Hannes says that pos -> docid is faster than docid -> pos
    # because the SKE uses pos as internal indices.
    log_manager.debug_global("Looking for URLs ...")
    url_records = select_urls_from_index1(db_cursor, docid_table_name, index1_table_name)
    log_manager.info_global(f"Found {len(url_records)} URLs to be converted. ")
    _convert_urls_to_docids(ske_config, db_connection, db_cursor, docid_table_name, url_records)

    # Direction 2: look for docids that are not yet in the translation table.
    log_manager.debug_global("Looking for docids ...")
    docid_records = select_docids_from_index2(db_cursor, docid_table_name, index2_table_names)
    log_manager.debug_global(f"Found {len(docid_records)} docids to be converted.")
    _convert_docids_to_urls(ske_config, db_connection, db_cursor, docid_table_name, docid_records)

    # All set!
    ske_manager.close_session()
    db_manager.close_db_connection(db_connection, db_cursor)
    return


def _convert_urls_to_docids(ske_config, db_connection, db_cursor, docid_table_name, url_records):
    """Resolve url -> pos -> docid for each record and insert it (direction 1)."""
    if len(url_records) > 0:
        ske_manager.create_session(ske_config)
        progressbar = progress.bar.Bar(
            'Converting URLs to docid',
            max=len(url_records),
            suffix='%(index)d/%(max)d done, ETA: %(eta_td)s h')
        for record in url_records:
            url = record['url']
            pos = ske_manager.get_pos_from_url(url)
            docid = ske_manager.get_docid_from_pos(
                ske_config, pos)  # this calls the API endpoint 'fullref'
            insert_into_table(db_connection, db_cursor, docid_table_name, docid, pos, url)
            progressbar.next()
        progressbar.finish()


def _convert_docids_to_urls(ske_config, db_connection, db_cursor, docid_table_name, docid_records):
    """Resolve docid -> pos -> url for each record and insert it (direction 2)."""
    if len(docid_records) > 0:
        ske_manager.create_session(ske_config)
        progressbar = progress.bar.Bar(
            'Converting docids to URLs',
            max=len(docid_records),
            suffix='%(index)d/%(max)d done, ETA: %(eta_td)s h')
        for record in docid_records:
            docid = record['docid']
            pos = ske_manager.get_pos_from_docid(
                ske_config, docid)  # this calls the API endpoint 'first'
            url = ske_manager.get_url_from_pos(ske_config, pos)
            insert_into_table(db_connection, db_cursor, docid_table_name, docid, pos, url)
            progressbar.next()
        progressbar.finish()