def test_process_page():
    """Check `pp.process_page` on the parade fixture with no redirects.

    Loads a page and its expected link contexts from the JSON fixtures,
    processes the page with an empty redirects lookup, and asserts that:

    * `document_info` carries over the page's title, plaintext and
      categories unchanged;
    * `link_contexts` equals the fixture contexts;
    * `entity_counts` equals `_.map_values(contexts, len)` (presumably
      the number of contexts per entity -- confirm against the `_`
      library imported by this module).
    """
    # JSON is UTF-8 by specification; pin the encoding so the fixtures are
    # not decoded with whatever the platform's locale default happens to be.
    with open('test/fixtures/parade_page.json', encoding='utf-8') as f:
        parade_page = json.load(f)
    with open('test/fixtures/parade_page_contexts.json',
              encoding='utf-8') as f:
        parade_page_contexts = json.load(f)

    redirects_lookup = {}
    processed_page = pp.process_page(redirects_lookup, parade_page)

    document_info = processed_page['document_info']
    assert document_info['title'] == parade_page['title']
    assert document_info['text'] == parade_page['plaintext']
    assert document_info['categories'] == parade_page['categories']
    assert processed_page['link_contexts'] == parade_page_contexts
    assert processed_page['entity_counts'] == _.map_values(
        parade_page_contexts, len)
def test_process_page_with_overlapping_mentions():
    """Check `pp.process_page` when a link's text contains the page title.

    The page is titled 'Other' and its only sentence has two links: one to
    'Other' with text 'some Other text' (which contains the title) and one
    to 'My page' with text 'my'. The test asserts that each entity ends up
    with exactly one context, at offsets 0 and 20 respectively.
    """
    sentence = 'some Other text and my stuff'
    links = [
        {'page': 'Other', 'text': 'some Other text'},
        {'page': 'My page', 'text': 'my'},
    ]
    page = {
        '_id': 'Other',
        'pageID': 0,
        'categories': [],
        'title': 'Other',
        'isDisambiguation': False,
        'plaintext': sentence,
        'sections': [{'sentences': [{'text': sentence, 'links': links}]}],
    }
    expected_contexts = {
        'Other': [{
            'text': 'some Other text',
            'sentence': sentence,
            'offset': 0,
            'page_title': 'Other',
            'preredirect': 'Other',
        }],
        'My page': [{
            'text': 'my',
            'sentence': sentence,
            'offset': 20,
            'page_title': 'Other',
            'preredirect': 'My page',
        }],
    }
    expected_counts = {'Other': 1, 'My page': 1}

    result = pp.process_page({}, page)

    doc_info = result['document_info']
    assert doc_info['title'] == 'Other'
    assert doc_info['text'] == sentence
    assert result['link_contexts'] == expected_contexts
    assert result['entity_counts'] == expected_counts
def test_process_page_with_redirects():
    """Check `pp.process_page` on the parade fixture with one redirect.

    The redirects lookup maps "Fort de Goede Hoop" to "Kaapstad". The
    expected link contexts are derived from the fixture by removing the
    "Fort de Goede Hoop" entry and inserting its first context into the
    "Kaapstad" list at index 1. `document_info` must still carry over the
    page's title, plaintext and categories, and `entity_counts` must equal
    `_.map_values(expected_contexts, len)`.
    """
    # JSON is UTF-8 by specification; pin the encoding so the fixtures are
    # not decoded with whatever the platform's locale default happens to be.
    with open('test/fixtures/parade_page.json', encoding='utf-8') as f:
        parade_page = json.load(f)
    with open('test/fixtures/parade_page_contexts.json',
              encoding='utf-8') as f:
        parade_page_contexts = json.load(f)

    redirects_lookup = {"Fort de Goede Hoop": "Kaapstad"}
    processed_page = pp.process_page(redirects_lookup, parade_page)

    document_info = processed_page['document_info']
    assert document_info['title'] == parade_page['title']
    assert document_info['text'] == parade_page['plaintext']
    assert document_info['categories'] == parade_page['categories']

    # Fold the redirected entity's context into its target. Only the first
    # context is moved -- presumably the fixture has exactly one; verify
    # against parade_page_contexts.json.
    kaapstad_contexts = parade_page_contexts["Kaapstad"]
    redirected_contexts = parade_page_contexts.pop("Fort de Goede Hoop")
    kaapstad_contexts.insert(1, redirected_contexts[0])

    assert processed_page['link_contexts'] == parade_page_contexts
    assert processed_page['entity_counts'] == _.map_values(
        parade_page_contexts, len)
def test_process_page_with_implicit_links():
    """Check `pp.process_page` for a link that has a 'page' but no 'text'.

    The only link in the sentence is `{'page': 'some'}`. The test asserts
    that it yields a context under the key 'Some' with text 'some' at
    offset 0, and that the page's own title 'My page' appears in
    `link_contexts` with an empty list and in `entity_counts` with 0.
    """
    sentence = 'some text'
    implicit_link = {'page': 'some'}
    page = {
        '_id': 'My page',
        'pageID': 0,
        'title': 'My page',
        'categories': [],
        'plaintext': sentence,
        'isDisambiguation': False,
        'sections': [{
            'sentences': [{'text': sentence, 'links': [implicit_link]}],
        }],
    }
    expected_contexts = {
        'My page': [],
        'Some': [{
            'text': 'some',
            'sentence': sentence,
            'offset': 0,
            'page_title': 'My page',
            'preredirect': 'Some',
        }],
    }
    expected_counts = {'My page': 0, 'Some': 1}

    result = pp.process_page({}, page)

    doc_info = result['document_info']
    assert doc_info['title'] == 'My page'
    assert doc_info['text'] == sentence
    assert doc_info['categories'] == []
    assert result['link_contexts'] == expected_contexts
    assert result['entity_counts'] == expected_counts