Example #1
def run():

    db_config = {
        "host": ConfigRoot.db_host,
        "dbname": ConfigRoot.db_name,
        "user": ConfigRoot.db_user,
        "password": ConfigRoot.db_password,
        "port": ConfigRoot.db_port
    }
    db_connection, db_cursor = db_manager.open_db_connection(db_config)

    # Redirect the configured reference table to a random-subset copy so that
    # the steps below operate on the subset instead of the full table.
    table_name_ref_articles_old = ConfigIndexBase.table_name_ref_articles
    table_name_ref_articles_new = table_name_ref_articles_old + "_random_subset"
    ConfigIndexBase.table_name_ref_articles = table_name_ref_articles_new

    create_main_reference_table(
        db_cursor, db_connection, table_name_ref_articles_old,
        table_name_ref_articles_new)  # TODO: comment this out if not needed

    trainer1 = None
    trainer2 = None

    for i in range(100):

        trainer1, trainer2 = train(
            trainer1, trainer2)  # TODO: comment this out if not needed
        drop_index(db_cursor,
                   db_connection)  # TODO: comment this out if not needed
        create_index(trainer1,
                     trainer2)  # TODO: comment this out if not needed
        compare_index(db_cursor, i)  # TODO: comment this out if not needed

    db_manager.close_db_connection(db_connection, db_cursor)
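
All of these examples rely on a small db_manager helper module whose implementation is not shown. Below is a minimal sketch of what it plausibly looks like, assuming psycopg2 with dict-style rows (the snippets index result rows by column name, e.g. result["url"]). The function bodies are an assumption, not the original code:

import psycopg2
import psycopg2.extras


def open_db_connection(db_config, db_connection=None, db_cursor=None):
    # db_config maps directly onto psycopg2.connect() keyword arguments
    # (host, dbname, user, password, port), as built in the example above.
    if db_connection is None:
        db_connection = psycopg2.connect(**db_config)
    if db_cursor is None:
        # RealDictCursor so rows can be indexed by column name.
        db_cursor = db_connection.cursor(
            cursor_factory=psycopg2.extras.RealDictCursor)
    return db_connection, db_cursor


def close_db_connection(db_connection, db_cursor):
    # Close the cursor first, then the connection; tolerate missing handles.
    if db_cursor is not None:
        db_cursor.close()
    if db_connection is not None:
        db_connection.close()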
Example #2
def stream_from_db_with_predictions(ske_config, db_config, index_table_name):

    log_manager.debug_global("Streaming from DB with predictions ...")

    db_connection = None
    db_cursor = None

    (db_connection,
     db_cursor) = db_manager.open_db_connection(db_config, db_connection,
                                                db_cursor)

    try:

        while True:

            # Pick the single not-yet-annotated, not-currently-selected
            # article with the lowest combined AF score.
            db_cursor.execute(
                sql.SQL(
                    'SELECT *, ("AF: Social Companions" + "AF: Soziale Medien") AS AF_SC_SM '
                    'FROM {index_table_name} '
                    'WHERE already_annotated = FALSE '
                    'AND already_selected = FALSE '  # left-over from the old system
                    "AND ((selected_on IS NULL) OR (selected_on < (NOW() - INTERVAL '2 days'))) "
                    'ORDER BY AF_SC_SM ASC '
                    'LIMIT 1').format(
                        index_table_name=sql.Identifier(index_table_name)))

            result = db_cursor.fetchone()

            if result is None:
                # No matching article left; end the stream.
                break

            url = result["url"]

            _select_text(db_connection, db_cursor, index_table_name, 'url',
                         url)

            options = _preselect_options(result)

            ske_doc = ske_manager.get_doc_from_url(ske_config, url)

            yield {
                "text": ske_doc["text"],
                "options": options['cats_as_options'],
                "accept": options['options_accepted'],
                "meta": {
                    "url": url,
                    "scores": options['scores_text']
                }
            }

    except Exception as ex:
        print(ex)

    finally:
        db_manager.close_db_connection(db_connection, db_cursor)
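
A hedged usage sketch: the generator is meant to be consumed lazily, one annotation task at a time. The table name below is a placeholder, not from the original code:

stream = stream_from_db_with_predictions(ske_config, db_config,
                                         "index_1__some_table")
task = next(stream)  # one Prodigy-style task dict
print(task["meta"]["url"], task["accept"])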
Example #3
    def _main():

        try:

            pos_start = _init_index_table()
            _populate_index_table(pos_start)

            db_connection.commit()

        except Exception:

            db_connection.rollback()
            raise

        finally:
            ske_manager.close_session()
            db_manager.close_db_connection(db_connection, db_cursor)
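
The _main() above is a nested helper: db_connection, db_cursor, _init_index_table, and _populate_index_table come from an enclosing function that is not part of the snippet. A sketch of what that enclosing scope plausibly looks like; only the closure structure is implied by the snippet, the rest is assumed:

def populate_index(db_config):
    db_connection, db_cursor = db_manager.open_db_connection(db_config)

    def _init_index_table():
        ...  # hypothetical: prepare the index table, return a start position

    def _populate_index_table(pos_start):
        ...  # hypothetical: fill the index table from pos_start onward

    def _main():
        ...  # the try/except/finally block shown above

    _main()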
def create_tables(db_config, index1_table_name, index2_table_names):

    (db_connection, db_cursor) = db_manager.open_db_connection(db_config)

    try:

        log_manager.debug_global("Dropping tables ...")
        db_cursor.execute(
            sql.SQL("""
                DROP TABLE IF EXISTS {table_keywords}, {table_scores}, {table_tokens} CASCADE;
                DROP INDEX IF EXISTS {score_idx} CASCADE;
            """).format(
                table_keywords=sql.Identifier(index2_table_names['keywords']),
                table_scores=sql.Identifier(index2_table_names['scores']),
                table_tokens=sql.Identifier(index2_table_names['tokens']),
                score_idx=sql.Identifier(
                    'index_2__mara002__lmvr_scores_score_rarity_diversity_idx')
            ))

        # table 1: keywords
        log_manager.debug_global(
            f"Creating table {index2_table_names['keywords']} ...")

        db_cursor.execute(
            sql.SQL("""
                CREATE TABLE {table} (
                    {pk} varchar NOT NULL,
                    corpus_count int4 NOT NULL,
                    category varchar NOT NULL,
                    CONSTRAINT index_2__mara002__lmvr_keywords_pk PRIMARY KEY ({pk})
                );
            """).format(table=sql.Identifier(index2_table_names['keywords']),
                        pk=sql.Identifier('keyword_id')))

        # table 2: texts + scores
        log_manager.debug_global(
            f"Creating table {index2_table_names['scores']} ...")

        db_cursor.execute(
            sql.SQL("""
                CREATE TABLE {table} (
                    {pk} varchar NOT NULL,
                    {score1} numeric NOT NULL,
                    already_annotated bool NULL,
                    selected_on timestamptz NULL,
                    CONSTRAINT index_2__mara002__lmvr_scores_pk PRIMARY KEY ({pk})
                );
                CREATE INDEX index_2__mara002__lmvr_scores_score_rarity_diversity_idx
                    ON {table}
                    USING btree
                    ({score1} DESC);
            """).format(table=sql.Identifier(index2_table_names['scores']),
                        pk=sql.Identifier('docid'),
                        score1=sql.Identifier('score_rarity_diversity')))

        # table 3: keywords in texts
        log_manager.debug_global(
            f"Creating table {index2_table_names['tokens']} ...")

        db_cursor.execute(
            sql.SQL("""
                CREATE TABLE {table} (
                    {fk_texts} varchar NOT NULL,
                    {fk_kw} varchar NOT NULL,
                    token_count int4 NOT NULL DEFAULT 0,
                    CONSTRAINT index_2__mara002__lmvr_tokens_pk PRIMARY KEY ({fk_texts}, {fk_kw}),
                    CONSTRAINT index_2__mara002__lmvr_tokens_fk FOREIGN KEY ({fk_texts})
                        REFERENCES {table_texts}({fk_texts})
                        ON UPDATE CASCADE
                        ON DELETE CASCADE,
                    CONSTRAINT index_2__mara002__lmvr_tokens_fk_keyword FOREIGN KEY ({fk_kw})
                        REFERENCES {table_kw}({fk_kw})
                        ON UPDATE CASCADE
                        ON DELETE CASCADE
                );
            """).format(
                table=sql.Identifier(index2_table_names['tokens']),
                table_texts=sql.Identifier(index2_table_names['scores']),
                fk_texts=sql.Identifier('docid'),
                table_kw=sql.Identifier(index2_table_names['keywords']),
                fk_kw=sql.Identifier('keyword_id')))

        db_connection.commit()

    except Exception:

        db_connection.rollback()
        raise

    finally:
        db_manager.close_db_connection(db_connection, db_cursor)

    return  # TODO: Is this empty return on purpose?
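
index2_table_names is expected to be a dict with the keys 'keywords', 'scores', and 'tokens', as used above; index1_table_name is accepted but never used in the body. A hedged call sketch, with table names patterned after the constraint identifiers hard-coded above (placeholders, not confirmed values):

create_tables(
    db_config,
    index1_table_name="index_1__mara002",  # unused by this function
    index2_table_names={
        "keywords": "index_2__mara002__lmvr_keywords",
        "scores": "index_2__mara002__lmvr_scores",
        "tokens": "index_2__mara002__lmvr_tokens",
    },
)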
def run():

    # get the VR info
    eval_data_container = main.load_gold_data(ConfigLoadG1)
    eval_data_container_VR = main.transform_gold_data(ConfigTransformG1VR,
                                                      eval_data_container)
    df_VR = pd.DataFrame(
        data=[{
            "article_id": gdi.article_id,
            "VR=ja": gdi.cats['Verantwortungsreferenz'] == 1,
        } for gdi in eval_data_container_VR.gold_data_item_list])

    # get the AF info
    eval_data_container = main.load_gold_data(ConfigLoadG1)
    eval_data_container_AF = main.transform_gold_data(
        ConfigTransformG1AF_Part1, eval_data_container)
    #eval_data_container_AF = main.transform_gold_data(ConfigTransformG1AF_Part2, eval_data_container_AF)
    df_AF = pd.DataFrame(data=[{
        "article_id": gdi.article_id,
        "AF=SM": gdi.cats['AF: Soziale Medien'] == 1,
        "AF=SC": gdi.cats['AF: Social Companions'] == 1,
    } for gdi in eval_data_container_AF.gold_data_item_list])

    # for each text, read from the DB how many LM it contains
    db_connection, db_cursor = db_manager.open_db_connection(
        db_config={
            "host": credentials.db_host,
            "dbname": credentials.db_name,
            "user": credentials.db_user,
            "password": credentials.db_password,
            "port": credentials.db_port
        })

    db_cursor.execute(
        sql.SQL("""
            select 
                t.docid as id, 
                count(distinct t.keyword_id) as dist, 
                sum(t.token_count) as total
            from {table_name} as t 
            where t.docid = any( %(docid_list)s )
            group by t.docid
            order by t.docid asc
        """).format(
            table_name=sql.Identifier('index_2__mara002__lmvr_tokens')), {
                'docid_list': [
                    gdi.article_id
                    for gdi in eval_data_container.gold_data_item_list
                ],
            })
    results = db_cursor.fetchall()
    df_LM = pd.DataFrame(data=[{
        "article_id": r['id'],
        "LMs total": r['total'],
        "LMs distinct": r['dist'],
    } for r in results])

    # close db connection
    db_manager.close_db_connection(db_connection, db_cursor)

    # merge the 3 dataframes
    df = df_LM.merge(df_AF, how='outer', on='article_id')
    df = df.merge(df_VR, how='outer', on='article_id')
    # the LM table in the db doesn't contain all texts, so we have NaN values. Replace those with 0.
    df['LMs total'] = df['LMs total'].fillna(0)
    df['LMs distinct'] = df['LMs distinct'].fillna(0)

    # define shortcuts to filter the dataframe
    maskAF = (df['AF=SC'] == True) | (df['AF=SM'] == True)
    maskVR = (df['VR=ja'] == True)

    main.log_manager.info_global(
        "--------------------------------\n"
        "Calculations complete. \n"
        "You can now access the DataFrame as `df`. \n"
        "There are 2 masks provided as `maskAF` (SC or SM) and `maskVR` (trivial). \n"
    )

    # usage example:
    # df[maskAF & maskVR]
    # df[~maskVR]

    # Drop into an interactive IPython shell; assumes a module-level
    # "from IPython import embed".
    embed()
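
Inside that shell, df and the two masks from run() are in scope and can be combined as the log message suggests. An illustrative follow-up:

# Cross-tabulate the two masks to see how AF and VR annotations co-occur.
print(pd.crosstab(maskAF, maskVR))
# Inspect the LM counts for texts that match both masks.
print(df.loc[maskAF & maskVR, ["article_id", "LMs total", "LMs distinct"]])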
def on_exit(controller):

    log_manager.debug_global("Prodigy: exiting ...")

    db_manager.close_db_connection(db_connection, db_cursor)
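
on_exit looks like a Prodigy recipe callback: Prodigy calls it with the controller when the annotation server shuts down, and in its original context it is presumably nested inside a recipe function where db_connection and db_cursor are in scope. A hypothetical skeleton showing where it plugs in (the recipe name, dataset argument, and view_id are placeholders, not from the original code):

import prodigy


@prodigy.recipe("annotate-articles")
def annotate_articles(dataset):
    db_connection, db_cursor = db_manager.open_db_connection(db_config)
    stream = stream_from_db_with_predictions(ske_config, db_config,
                                             index_table_name)

    def on_exit(controller):
        db_manager.close_db_connection(db_connection, db_cursor)

    return {
        "dataset": dataset,
        "stream": stream,
        "view_id": "choice",  # plausible, given the "options"/"accept" tasks
        "on_exit": on_exit,
    }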
def run(ske_config,
        db_config,
        docid_table_name,
        index1_table_name,
        index2_table_names,
        should_drop_create_table=False):

    (db_connection, db_cursor) = db_manager.open_db_connection(db_config)

    if should_drop_create_table:

        create_table(db_connection, db_cursor, docid_table_name)

    # Direction 1: look for URLs that are not yet in the translation table

    # Hannes says that pos -> docid is faster than docid -> pos
    # because the SKE uses pos as internal indices

    log_manager.debug_global("Looking for URLs ...")
    url_records = select_urls_from_index1(db_cursor, docid_table_name,
                                          index1_table_name)
    log_manager.info_global(f"Found {len(url_records)} URLs to be converted. ")

    if len(url_records) > 0:

        ske_manager.create_session(ske_config)

        progressbar = progress.bar.Bar(
            'Converting URLs to docid',
            max=len(url_records),
            suffix='%(index)d/%(max)d done, ETA: %(eta_td)s h')

        for record in url_records:
            url = record['url']
            pos = ske_manager.get_pos_from_url(url)
            docid = ske_manager.get_docid_from_pos(
                ske_config, pos)  # this calls the API endpoint 'fullref'
            insert_into_table(db_connection, db_cursor, docid_table_name,
                              docid, pos, url)
            progressbar.next()

        progressbar.finish()

    # Direction 2: look for docids that are not yet in the translation table

    log_manager.debug_global("Looking for docids ...")
    docid_records = select_docids_from_index2(db_cursor, docid_table_name,
                                              index2_table_names)
    log_manager.debug_global(
        f"Found {len(docid_records)} docids to be converted.")

    if len(docid_records) > 0:

        ske_manager.create_session(ske_config)

        progressbar = progress.bar.Bar(
            'Converting docids to URLs',
            max=len(docid_records),
            suffix='%(index)d/%(max)d done, ETA: %(eta_td)s h')

        for record in docid_records:
            docid = record['docid']
            pos = ske_manager.get_pos_from_docid(
                ske_config, docid)  # this calls the API endpoint 'first'
            url = ske_manager.get_url_from_pos(ske_config, pos)
            insert_into_table(db_connection, db_cursor, docid_table_name,
                              docid, pos, url)
            progressbar.next()

        progressbar.finish()

    # All set!

    ske_manager.close_session()

    db_manager.close_db_connection(db_connection, db_cursor)

    return
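
A hedged call sketch for this synchronisation step; all table names are placeholders patterned after the identifiers in the earlier examples:

run(
    ske_config,
    db_config,
    docid_table_name="docid_pos_url_translation",  # placeholder
    index1_table_name="index_1__mara002",          # placeholder
    index2_table_names={
        "keywords": "index_2__mara002__lmvr_keywords",
        "scores": "index_2__mara002__lmvr_scores",
        "tokens": "index_2__mara002__lmvr_tokens",
    },
    should_drop_create_table=False,
)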