Example #1
0
def test_process_page():
    """Check `pp.process_page` against the parade page fixtures.

    Loads a page from `parade_page.json` and the expected link contexts from
    `parade_page_contexts.json`, processes the page with an empty redirects
    lookup, and compares the document info, the link contexts and the
    entity counts (the number of expected contexts per entity).
    """
    # Read the JSON fixtures as UTF-8 explicitly so decoding does not depend
    # on the platform's default locale encoding.
    with open('test/fixtures/parade_page.json', encoding='utf-8') as f:
        parade_page = json.load(f)
    with open('test/fixtures/parade_page_contexts.json',
              encoding='utf-8') as f:
        parade_page_contexts = json.load(f)
    redirects_lookup = {}
    processed_page = pp.process_page(redirects_lookup, parade_page)
    assert processed_page['document_info']['title'] == parade_page['title']
    assert processed_page['document_info']['text'] == parade_page['plaintext']
    assert processed_page['document_info']['categories'] == parade_page[
        'categories']
    assert processed_page['link_contexts'] == parade_page_contexts
    # One count per entity: the length of its expected context list.
    assert processed_page['entity_counts'] == _.map_values(
        parade_page_contexts, len)
Example #2
0
def test_process_page_with_overlapping_mentions():
    """Check link contexts for a one-sentence page holding two links.

    The page is titled 'Other' and its single sentence links
    'some Other text' to 'Other' and 'my' to 'My page'. The test expects one
    context per link, at offsets 0 and 20 respectively, and an entity count
    of 1 for each linked page.
    """
    sentence_text = 'some Other text and my stuff'
    mention_links = [
        {'page': 'Other', 'text': 'some Other text'},
        {'page': 'My page', 'text': 'my'},
    ]
    page = {
        '_id': 'Other',
        'pageID': 0,
        'categories': [],
        'title': 'Other',
        'isDisambiguation': False,
        'plaintext': sentence_text,
        'sections': [{
            'sentences': [{'text': sentence_text, 'links': mention_links}],
        }],
    }

    result = pp.process_page({}, page)

    doc_info = result['document_info']
    assert doc_info['title'] == 'Other'
    assert doc_info['text'] == 'some Other text and my stuff'

    expected_contexts = {
        'Other': [{
            'text': 'some Other text',
            'sentence': sentence_text,
            'offset': 0,
            'page_title': 'Other',
            'preredirect': 'Other',
        }],
        'My page': [{
            'text': 'my',
            'sentence': sentence_text,
            # 'my' starts at index 20 of the sentence.
            'offset': 20,
            'page_title': 'Other',
            'preredirect': 'My page',
        }],
    }
    assert result['link_contexts'] == expected_contexts
    assert result['entity_counts'] == {'Other': 1, 'My page': 1}
Example #3
0
def test_process_page_with_redirects():
    """Check `pp.process_page` on the parade fixtures with one redirect.

    Uses the same fixtures as the no-redirect case but passes a redirects
    lookup mapping "Fort de Goede Hoop" to "Kaapstad". The expected contexts
    are adjusted accordingly before comparison.
    """
    # Read the JSON fixtures as UTF-8 explicitly so decoding does not depend
    # on the platform's default locale encoding.
    with open('test/fixtures/parade_page.json', encoding='utf-8') as f:
        parade_page = json.load(f)
    with open('test/fixtures/parade_page_contexts.json',
              encoding='utf-8') as f:
        parade_page_contexts = json.load(f)
    redirects_lookup = {"Fort de Goede Hoop": "Kaapstad"}
    processed_page = pp.process_page(redirects_lookup, parade_page)
    assert processed_page['document_info']['title'] == parade_page['title']
    assert processed_page['document_info']['text'] == parade_page['plaintext']
    assert processed_page['document_info']['categories'] == parade_page[
        'categories']
    # Rewrite the expectation for the redirect: drop the
    # "Fort de Goede Hoop" entry and place its first context at index 1 of
    # the "Kaapstad" context list.
    parade_page_contexts["Kaapstad"].insert(
        1,
        parade_page_contexts.pop("Fort de Goede Hoop")[0])
    assert processed_page['link_contexts'] == parade_page_contexts
    # One count per entity: the length of its (adjusted) context list.
    assert processed_page['entity_counts'] == _.map_values(
        parade_page_contexts, len)
Example #4
0
def test_process_page_with_implicit_links():
    """Check link contexts when a link carries a 'page' but no 'text'.

    The page 'My page' has one sentence, 'some text', whose only link is
    {'page': 'some'}. The test expects a context keyed 'Some' with text
    'some' at offset 0, plus an empty context list (count 0) for 'My page'
    itself.
    """
    implicit_link = {'page': 'some'}
    only_sentence = {'text': 'some text', 'links': [implicit_link]}
    page = {
        '_id': 'My page',
        'pageID': 0,
        'title': 'My page',
        'categories': [],
        'plaintext': 'some text',
        'isDisambiguation': False,
        'sections': [{'sentences': [only_sentence]}],
    }

    result = pp.process_page({}, page)

    doc_info = result['document_info']
    assert doc_info['title'] == 'My page'
    assert doc_info['text'] == 'some text'
    assert doc_info['categories'] == []

    expected_contexts = {
        'My page': [],
        'Some': [{
            'text': 'some',
            'sentence': 'some text',
            'offset': 0,
            'page_title': 'My page',
            'preredirect': 'Some',
        }],
    }
    assert result['link_contexts'] == expected_contexts
    assert result['entity_counts'] == {'My page': 0, 'Some': 1}