Beispiel #1
0
def test_document_find_by_uris_no_matches():
    """find_by_uris yields no results when none of the given URIs is stored."""
    stored_uri = 'https://en.wikipedia.org/wiki/Main_Page'
    stored_doc = Document()
    stored_doc.document_uris.append(
        DocumentURI(claimant=stored_uri, uri=stored_uri))
    db.Session.add(stored_doc)
    db.Session.flush()

    # Look up a URI that differs from the only one persisted above.
    unrelated_uris = ['https://de.wikipedia.org/wiki/Hauptseite']
    matches = Document.find_by_uris(db.Session, unrelated_uris)

    assert matches.count() == 0
Beispiel #2
0
def test_document_title():
    """Document.title equals the value of an attached 'title' meta entry."""
    titled = Document()
    # The meta entry is associated with the document only through the
    # ``document=`` argument; the instance itself is not kept around.
    DocumentMeta(
        type='title',
        value='The Title',
        document=titled,
        claimant='http://example.com')
    db.Session.add(titled)
    db.Session.flush()

    expected = 'The Title'
    assert titled.title == expected
Beispiel #3
0
def test_document_title_meta_not_found():
    """Document.title is None when the only meta entry is not of type 'title'."""
    untitled = Document()
    # Attach a single meta entry whose type is something other than 'title'.
    DocumentMeta(
        type='other',
        value='something',
        document=untitled,
        claimant='http://example.com')
    db.Session.add(untitled)
    db.Session.flush()

    resolved_title = untitled.title
    assert resolved_title is None
Beispiel #4
0
def create_or_update_document_objects(es_ann):
    """
    Mirror the document attached to ``es_ann`` into ``Document`` records.

    Does nothing when ``es_ann.document`` is falsy. Otherwise it looks up (or
    creates) the documents matching the annotation's target URI and the
    document's URIs, merges them when more than one is returned, stamps the
    resulting document with the source ``updated`` value and then creates or
    updates each document URI and meta entry against it.

    :param es_ann: annotation object exposing ``document`` and ``target_uri``
    """
    source_doc = es_ann.document
    if not source_doc:
        return

    last_updated = source_doc.updated
    known_uris = [entry.uri for entry in source_doc.document_uris]
    matches = Document.find_or_create_by_uris(
        Session,
        es_ann.target_uri,
        known_uris,
        created=source_doc.created,
        updated=last_updated)

    # Several matching documents are collapsed into one before updating.
    if matches.count() > 1:
        target = merge_documents(Session, matches, updated=last_updated)
    else:
        target = matches.first()

    target.updated = last_updated

    for uri_entry in source_doc.document_uris:
        create_or_update_document_uri(uri_entry, target)

    for meta_entry in source_doc.meta:
        create_or_update_document_meta(meta_entry, target)
Beispiel #5
0
def expand_uri(request, uri):
    """
    Return all URIs which refer to the same underlying document as `uri`.

    This function determines whether we already have "document" records for the
    passed URI, and if so returns the set of all URIs which we currently
    believe refer to the same document.

    If no document is found, or if `uri` is recorded on the document as a
    ``rel-canonical`` URI, the result is just ``[uri]``.

    :param request: the request object
    :type request: pyramid.request.Request

    :param uri: a URI associated with the document
    :type uri: str

    :returns: a list of equivalent URIs
    :rtype: list
    """
    if _postgres_enabled(request):
        # NOTE(review): SQLAlchemy's one_or_none() raises when more than one
        # row matches -- confirm a single URI can never map to several
        # documents here.
        doc = Document.find_by_uris(request.db, [uri]).one_or_none()
    else:
        doc = elastic.Document.get_by_uri(uri)

    if doc is None:
        return [uri]

    # We check if the match was a "canonical" link. If so, all annotations
    # created on that page are guaranteed to have that as their target.source
    # field, so we don't need to expand to other URIs and risk false positives.
    docuris = doc.document_uris
    matched_canonical = any(
        docuri.uri == uri and docuri.type == 'rel-canonical'
        for docuri in docuris)
    if matched_canonical:
        return [uri]

    return [docuri.uri for docuri in docuris]
Beispiel #6
0
def test_document(annotation):
    """annotation.document is the document holding the annotation's target URI."""
    target = annotation.target_uri
    matching_uri = DocumentURI(claimant=target, uri=target)
    expected_doc = Document(document_uris=[matching_uri])
    db.Session.add(expected_doc)
    db.Session.flush()

    assert annotation.document == expected_doc
Beispiel #7
0
def test_document_not_found(annotation):
    """annotation.document is None when the only stored document has another URI."""
    other = 'something-else'
    unrelated_doc = Document(document_uris=[
        DocumentURI(claimant=other, uri=other),
    ])
    db.Session.add(unrelated_doc)
    db.Session.flush()

    resolved = annotation.document
    assert resolved is None
Beispiel #8
0
def test_document_find_by_uris():
    """find_by_uris returns only the document owning one of the given URIs."""
    german_uri = 'https://de.wikipedia.org/wiki/Hauptseite'
    german_doc = Document()
    german_doc.document_uris.append(
        DocumentURI(claimant=german_uri, uri=german_uri))

    english_uri = 'https://en.wikipedia.org/wiki/Main_Page'
    english_doc = Document()
    english_doc.document_uris.append(
        DocumentURI(claimant=english_uri, uri=english_uri))
    # A second claimant asserting the same URI for the same document.
    site_root = 'https://en.wikipedia.org'
    english_doc.document_uris.append(
        DocumentURI(claimant=site_root, uri=english_uri))

    db.Session.add_all([german_doc, english_doc])
    db.Session.flush()

    # Only the first of these URIs was stored above.
    lookup = [
        'https://en.wikipedia.org/wiki/Main_Page',
        'https://m.en.wikipedia.org/wiki/Main_Page',
    ]
    found = Document.find_by_uris(db.Session, lookup)

    assert found.count() == 1
    assert found.first() == english_doc
Beispiel #9
0
    def test_expand_uri_document_uris(self, db_session):
        """expand_uri returns every URI stored on the matching document."""
        claimant = 'http://bar.com'
        first_uri = DocumentURI(uri='http://foo.com/', claimant=claimant)
        second_uri = DocumentURI(uri='http://bar.com/', claimant=claimant)
        stored_doc = Document(document_uris=[first_uri, second_uri])
        db_session.add(stored_doc)
        db_session.flush()

        expanded = storage.expand_uri(db_session, 'http://foo.com/')

        assert expanded == ['http://foo.com/', 'http://bar.com/']
Beispiel #10
0
def test_document_find_or_create_by_uris():
    """find_or_create_by_uris returns the existing document when a URI matches."""
    main_page = 'https://en.wikipedia.org/wiki/Main_Page'
    mobile_claimant = (
        'https://en.wikipedia.org/wiki/http/en.m.wikipedia.org/wiki/Main_Page')

    existing = Document()
    # Two claimants, both pointing the same URI at the same document.
    stored_uris = [
        DocumentURI(claimant=main_page, uri=main_page, document=existing),
        DocumentURI(claimant=mobile_claimant, uri=main_page, document=existing),
    ]
    for stored in stored_uris:
        db.Session.add(stored)
    db.Session.flush()

    found = Document.find_or_create_by_uris(
        db.Session,
        'https://en.wikipedia.org/wiki/Main_Page',
        ['https://en.wikipedia.org/wiki/http/en.m.wikipedia.org/wiki/Main_Page',
         'https://m.en.wikipedia.org/wiki/Main_Page'])

    assert found.count() == 1
    assert found.first() == existing
Beispiel #11
0
def merge_data(request):
    """
    Persist two documents sharing one URI and return them as a pair.

    Both documents carry the same URI and the same title value but differ in
    claimant and URI type ('self-claim' vs 'rel-canonical').

    :param request: accepted but not used in this function
    :returns: ``(master, duplicate)`` after both were added and flushed
    """
    shared_uri = 'https://en.wikipedia.org/wiki/Main_Page'
    mobile_claimant = 'https://m.en.wikipedia.org/wiki/Main_Page'
    shared_title = 'Wikipedia, the free encyclopedia'

    master = Document(
        document_uris=[
            DocumentURI(claimant=shared_uri, uri=shared_uri, type='self-claim'),
        ],
        meta=[
            DocumentMeta(claimant=shared_uri, type='title', value=shared_title),
        ])

    duplicate = Document(
        document_uris=[
            DocumentURI(claimant=mobile_claimant,
                        uri=shared_uri,
                        type='rel-canonical'),
        ],
        meta=[
            DocumentMeta(claimant=mobile_claimant,
                         type='title',
                         value=shared_title),
        ])

    db.Session.add_all([master, duplicate])
    db.Session.flush()
    return (master, duplicate)
Beispiel #12
0
def test_og_document(render_app_html, annotation_document, document_title):
    """
    annotation_page passes a meta attribute mentioning both user and title.

    At least one entry of ``extra['meta_attrs']`` handed to the renderer must
    have a ``content`` containing 'foo' (the userid used here) and 'Starmap'
    (part of the stubbed document title).
    """
    annotation = Annotation(id='123', userid='foo', target_uri='http://example.com')
    annotation_document.return_value = Document()
    document_title.return_value = 'WikiHow — How to Make a ☆Starmap☆'
    render_app_html.return_value = '<html></html>'

    main.annotation_page(annotation, _dummy_request())

    # Only the keyword arguments of the render call are inspected.
    _, call_kwargs = render_app_html.call_args

    def mentions_user_and_title(attrs):
        return 'foo' in attrs['content'] and 'Starmap' in attrs['content']

    meta_attrs = call_kwargs['extra']['meta_attrs']
    assert any(mentions_user_and_title(attrs) for attrs in meta_attrs)
Beispiel #13
0
def test_expand_uri_postgres_document_uris(postgres_enabled):
    """With postgres enabled, expand_uri returns all URIs of the matching document."""
    postgres_enabled.return_value = True
    request = DummyRequest(db=db.Session)

    claimant = 'http://bar.com'
    stored_doc = Document(document_uris=[
        DocumentURI(uri='http://foo.com/', claimant=claimant),
        DocumentURI(uri='http://bar.com/', claimant=claimant),
    ])
    db.Session.add(stored_doc)
    db.Session.flush()

    expanded = storage.expand_uri(request, 'http://foo.com/')

    assert expanded == ['http://foo.com/', 'http://bar.com/']
Beispiel #14
0
    def test_expand_uri_document_doesnt_expand_canonical_uris(
            self, db_session):
        """expand_uri returns only the queried URI when it is 'rel-canonical'."""
        claimant = 'http://example.com'
        canonical_uri = DocumentURI(uri='http://example.com/',
                                    type='rel-canonical',
                                    claimant=claimant)
        stored_doc = Document(document_uris=[
            DocumentURI(uri='http://foo.com/', claimant=claimant),
            DocumentURI(uri='http://bar.com/', claimant=claimant),
            canonical_uri,
        ])
        db_session.add(stored_doc)
        db_session.flush()

        expanded = storage.expand_uri(db_session, "http://example.com/")

        assert expanded == ["http://example.com/"]
Beispiel #15
0
def test_document_find_or_create_by_uris_no_results():
    """find_or_create_by_uris creates a self-claim document when nothing matches."""
    stored_uri = 'https://en.wikipedia.org/wiki/Main_Page'
    unrelated_doc = Document()
    db.Session.add(DocumentURI(
        claimant=stored_uri, uri=stored_uri, document=unrelated_doc))
    db.Session.flush()

    # Neither the claimant URI nor the extra URI below was stored above.
    results = Document.find_or_create_by_uris(
        db.Session,
        'https://en.wikipedia.org/wiki/Pluto',
        ['https://m.en.wikipedia.org/wiki/Pluto'])

    assert results.count() == 1

    created = results.first()
    assert isinstance(created, Document)
    assert len(created.document_uris) == 1

    created_uri = created.document_uris[0]
    assert created_uri.claimant == 'https://en.wikipedia.org/wiki/Pluto'
    assert created_uri.uri == 'https://en.wikipedia.org/wiki/Pluto'
    assert created_uri.type == 'self-claim'
0
def test_expand_uri_postgres_document_doesnt_expand_canonical_uris(
        postgres_enabled):
    """With postgres enabled, a 'rel-canonical' URI is returned unexpanded."""
    postgres_enabled.return_value = True
    request = DummyRequest(db=db.Session)

    claimant = 'http://example.com'
    canonical_uri = DocumentURI(uri='http://example.com/',
                                type='rel-canonical',
                                claimant=claimant)
    stored_doc = Document(document_uris=[
        DocumentURI(uri='http://foo.com/', claimant=claimant),
        DocumentURI(uri='http://bar.com/', claimant=claimant),
        canonical_uri,
    ])
    db.Session.add(stored_doc)
    db.Session.flush()

    expanded = storage.expand_uri(request, "http://example.com/")

    assert expanded == ["http://example.com/"]
Beispiel #17
0
def create_or_update_document_objects(es_ann):
    """
    Create or update ``Document`` records from the document on ``es_ann``.

    Returns immediately when ``es_ann.document`` is falsy. Otherwise the
    documents matching the annotation's target URI and the document's URIs are
    found or created, merged into one if several come back, given the source
    ``updated`` value, and then each document URI and meta entry is created or
    updated against that document.

    :param es_ann: annotation object exposing ``document`` and ``target_uri``
    """
    es_doc = es_ann.document
    if not es_doc:
        return

    candidate_uris = [docuri.uri for docuri in es_doc.document_uris]
    found = Document.find_or_create_by_uris(
        Session,
        es_ann.target_uri,
        candidate_uris,
        created=es_doc.created,
        updated=es_doc.updated,
    )

    # More than one match means the records must be merged into a single one.
    document = (
        merge_documents(Session, found, updated=es_doc.updated)
        if found.count() > 1
        else found.first()
    )
    document.updated = es_doc.updated

    for docuri in es_doc.document_uris:
        create_or_update_document_uri(docuri, document)

    for docmeta in es_doc.meta:
        create_or_update_document_meta(docmeta, document)