def main():
    """Build and write IOBES rows for every page that has mentions.

    Loads ``.env``, reads the database settings from the ``EL_DBNAME``,
    ``DBUSER``, ``DBPASS`` and ``DBHOST`` environment variables and opens a
    pymysql connection using ``DictCursor``.  Two cursors are used: one to
    fetch the pages and one to fetch each page's mentions.  For every page
    the mentions are fetched, their ``entity`` values are plucked as link
    titles, and the result of ``get_page_iobes`` is handed to
    ``write_page_iobes``.  The connection is closed even if processing fails.
    """
    load_dotenv(dotenv_path='.env')
    db_name = os.getenv("EL_DBNAME")
    db_user = os.getenv("DBUSER")
    db_password = os.getenv("DBPASS")
    db_host = os.getenv("DBHOST")
    connection = pymysql.connect(
        host=db_host,
        user=db_user,
        password=db_password,
        db=db_name,
        charset='utf8mb4',
        use_unicode=True,
        cursorclass=pymysql.cursors.DictCursor,
    )
    # Issued on each cursor, in this order, before any query is run.
    charset_statements = (
        "SET NAMES utf8mb4;",
        "SET CHARACTER SET utf8mb4;",
        "SET character_set_connection=utf8mb4;",
    )
    try:
        with connection.cursor() as pages_cursor:
            for statement in charset_statements:
                pages_cursor.execute(statement)
            with connection.cursor() as mentions_cursor:
                for statement in charset_statements:
                    mentions_cursor.execute(statement)
                pages, page_count = get_nondisambiguation_pages_having_mentions(
                    pages_cursor)
                for page in progressbar(pages, max_value=page_count):
                    sorted_mentions = get_page_mentions_by_entity(
                        mentions_cursor, page['id'])
                    link_titles = _.pluck(sorted_mentions, 'entity')
                    write_page_iobes(
                        page,
                        get_page_iobes(page, sorted_mentions, link_titles))
    finally:
        connection.close()
def test_get_page_iobes_overlapping_matches():
    """A three-token mention is tagged B/I/E and a one-token mention S.

    Tokens outside any mention are expected as ``[token, 'O']`` and the link
    title ``'My page'`` is expected percent-encoded as ``'My%20page'``.
    """
    page = {
        'source_id': 0,
        'title': 'Other',
        'content': 'some other text and my stuff',
    }
    first_mention = {'text': 'some other text',
                     'offset': 0,
                     'page_title': 'Other'}
    second_mention = {'text': 'my',
                      'offset': 20,
                      'page_title': 'My page'}
    mentions = [first_mention, second_mention]
    mention_link_titles = ['Other', 'My page']
    expected = [[
        ['some', 'Other', 'B'],
        ['other', 'Other', 'I'],
        ['text', 'Other', 'E'],
        ['and', 'O'],
        ['my', 'My%20page', 'S'],
        ['stuff', 'O'],
    ]]
    actual = iobes.get_page_iobes(page, mentions, mention_link_titles)
    assert expected == actual
def test_get_page_iobes_straddling_mention():
    """A mention whose text contains a '.' is kept as a single B/I/I/E span.

    The mention text is ``'2002–03 NHL. season'`` while its link title is
    ``'2002–03 NHL season'``; every token, including the ``'.'``, is expected
    to carry the percent-encoded link title.
    """
    link_title = '2002–03 NHL season'
    encoded_title = '2002%E2%80%9303%20NHL%20season'
    page = {
        'content': '2002–03 NHL. season',
        'source_id': 0,
        'title': '2002–03 Buffalo Sabres season',
    }
    page_contexts = {
        link_title: [{'text': '2002–03 NHL. season',
                      'offset': 0,
                      'page_title': '2002–03 Buffalo Sabres season'}],
    }
    # The dict has exactly one entry, so unpack its only list of mentions.
    (mentions,) = page_contexts.values()
    mention_link_titles = [link_title]
    tagged_tokens = [('2002–03', 'B'), ('NHL', 'I'), ('.', 'I'), ('season', 'E')]
    page_iobes = [[[token, encoded_title, tag] for token, tag in tagged_tokens]]
    assert page_iobes == iobes.get_page_iobes(page, mentions, mention_link_titles)
def test_get_page_iobes():
    """Compare get_page_iobes on the parade fixture page with ``parade_iobes``.

    Mentions whose offset is not below the length of the page content are
    dropped; the remaining (link title, mention) pairs are sorted by mention
    offset before being passed to ``get_page_iobes``.
    """
    with open('test/fixtures/parade_page_db.json') as f:
        parade_page = json.load(f)
    with open('test/fixtures/parade_page_contexts.json') as f:
        raw_contexts = json.load(f)

    def keep_in_bounds(mentions):
        # Keep only mentions that start inside the page content.
        return [mention for mention in mentions
                if mention['offset'] < len(parade_page['content'])]

    def pair_title_with_mentions(pair):
        # pair is [link title, list of mentions] as produced by _.to_pairs.
        return [[pair[0], mention] for mention in pair[1]]

    def mention_offset(title_mention):
        return title_mention[1]['offset']

    parade_page_contexts = _.map_values(raw_contexts, keep_in_bounds)
    context_pairs = _.mapcat(_.to_pairs(parade_page_contexts),
                             pair_title_with_mentions)
    contexts = _.sort_by(context_pairs, mention_offset)
    mentions = _.flat_map(contexts, _.last)
    mention_link_titles = [_.head(context) for context in contexts]
    actual = iobes.get_page_iobes(parade_page, mentions, mention_link_titles)
    assert parade_iobes == actual