def test_expand_uri_document_uris(self, db_session):
    """Expanding one of a document's URIs yields all of that document's URIs."""
    uris = [
        DocumentURI(uri='http://foo.com/', claimant='http://bar.com'),
        DocumentURI(uri='http://bar.com/', claimant='http://bar.com'),
    ]
    db_session.add(Document(document_uris=uris))
    db_session.flush()

    expanded = storage.expand_uri(db_session, 'http://foo.com/')

    assert expanded == ['http://foo.com/', 'http://bar.com/']
def test_expand_uri_postgres_document_uris(postgres_enabled):
    """With postgres enabled, expanding a URI yields both of the document's URIs."""
    request = DummyRequest(db=db.Session)
    postgres_enabled.return_value = True

    uris = [
        DocumentURI(uri='http://foo.com/', claimant='http://bar.com'),
        DocumentURI(uri='http://bar.com/', claimant='http://bar.com'),
    ]
    db.Session.add(Document(document_uris=uris))
    db.Session.flush()

    expanded = storage.expand_uri(request, 'http://foo.com/')

    assert expanded == ['http://foo.com/', 'http://bar.com/']
def test_expand_uri_document_doesnt_expand_canonical_uris(self, db_session):
    """A URI stored with type 'rel-canonical' expands only to itself."""
    claimant = 'http://example.com'
    uris = [
        DocumentURI(uri='http://foo.com/', claimant=claimant),
        DocumentURI(uri='http://bar.com/', claimant=claimant),
        DocumentURI(uri='http://example.com/',
                    type='rel-canonical',
                    claimant=claimant),
    ]
    db_session.add(Document(document_uris=uris))
    db_session.flush()

    expanded = storage.expand_uri(db_session, 'http://example.com/')

    assert expanded == ['http://example.com/']
def test_document_not_found(annotation):
    """`annotation.document` is None when the only stored URI is 'something-else'."""
    unrelated_uri = DocumentURI(claimant='something-else',
                                uri='something-else')
    db.Session.add(Document(document_uris=[unrelated_uri]))
    db.Session.flush()

    assert annotation.document is None
def test_document(annotation):
    """`annotation.document` is the document whose URI is the annotation's target_uri."""
    matching_uri = DocumentURI(claimant=annotation.target_uri,
                               uri=annotation.target_uri)
    expected = Document(document_uris=[matching_uri])
    db.Session.add(expected)
    db.Session.flush()

    assert annotation.document == expected
def test_expand_uri_postgres_document_doesnt_expand_canonical_uris(
        postgres_enabled):
    """With postgres enabled, a 'rel-canonical' URI expands only to itself."""
    request = DummyRequest(db=db.Session)
    postgres_enabled.return_value = True

    claimant = 'http://example.com'
    uris = [
        DocumentURI(uri='http://foo.com/', claimant=claimant),
        DocumentURI(uri='http://bar.com/', claimant=claimant),
        DocumentURI(uri='http://example.com/',
                    type='rel-canonical',
                    claimant=claimant),
    ]
    db.Session.add(Document(document_uris=uris))
    db.Session.flush()

    expanded = storage.expand_uri(request, 'http://example.com/')

    assert expanded == ['http://example.com/']
def test_document_find_by_uris_no_matches():
    """find_by_uris yields no rows when none of the given URIs is stored."""
    stored = Document()
    stored_uris = stored.document_uris
    stored_uris.append(DocumentURI(
        claimant='https://en.wikipedia.org/wiki/Main_Page',
        uri='https://en.wikipedia.org/wiki/Main_Page'))
    db.Session.add(stored)
    db.Session.flush()

    lookup = ['https://de.wikipedia.org/wiki/Hauptseite']
    matches = Document.find_by_uris(db.Session, lookup)

    assert matches.count() == 0
def test_document_find_by_uris():
    """find_by_uris returns exactly the one document that has a matching URI."""
    german_doc = Document()
    german_uri = 'https://de.wikipedia.org/wiki/Hauptseite'
    german_doc.document_uris.append(
        DocumentURI(claimant=german_uri, uri=german_uri))

    english_doc = Document()
    english_uri = 'https://en.wikipedia.org/wiki/Main_Page'
    english_doc.document_uris.append(
        DocumentURI(claimant=english_uri, uri=english_uri))
    # Second claim on the same URI, made by a different claimant.
    other_claimant = 'https://en.wikipedia.org'
    english_doc.document_uris.append(
        DocumentURI(claimant=other_claimant, uri=english_uri))

    db.Session.add_all([german_doc, english_doc])
    db.Session.flush()

    lookup = [
        'https://en.wikipedia.org/wiki/Main_Page',
        'https://m.en.wikipedia.org/wiki/Main_Page',
    ]
    matches = Document.find_by_uris(db.Session, lookup)

    assert matches.count() == 1
    assert matches.first() == english_doc
def create_or_update_document_uri(es_docuri, pg_document):
    """
    Create the DocumentURI matching ``es_docuri``, or touch the existing one.

    An existing row is looked up by normalized claimant, normalized URI, type
    and content type. If none is found, a new DocumentURI attached to
    ``pg_document`` is added to the session. If one is found but belongs to a
    different document, a warning is logged and the row is left attached to
    its current document. In every case the row's ``updated`` value is set
    from ``es_docuri``.

    :param es_docuri: source object providing claimant, uri, type,
        content_type, created and updated (plus the normalized variants).
    :param pg_document: the document the URI is expected to belong to.
    """
    docuri = DocumentURI.query.filter(
        DocumentURI.claimant_normalized == es_docuri.claimant_normalized,
        DocumentURI.uri_normalized == es_docuri.uri_normalized,
        DocumentURI.type == es_docuri.type,
        DocumentURI.content_type == es_docuri.content_type).first()

    if docuri is None:
        docuri = DocumentURI(claimant=es_docuri.claimant,
                             uri=es_docuri.uri,
                             type=es_docuri.type,
                             content_type=es_docuri.content_type,
                             document=pg_document,
                             created=es_docuri.created,
                             updated=es_docuri.updated)
        Session.add(docuri)
    elif not docuri.document == pg_document:
        # Logging interpolates with ``msg % args``; the previous ``{:d}``
        # placeholders were never filled in and made the log call fail to
        # format, so the warning was effectively lost.
        log.warning(
            'Found DocumentURI with id %d does not match expected '
            'document with id %d',
            docuri.id, pg_document.id)

    docuri.updated = es_docuri.updated
def test_document_find_or_create_by_uris():
    """find_or_create_by_uris returns the existing document when a URI matches."""
    existing = Document()
    self_claim = DocumentURI(
        claimant='https://en.wikipedia.org/wiki/Main_Page',
        uri='https://en.wikipedia.org/wiki/Main_Page',
        document=existing)
    other_claim = DocumentURI(
        claimant='https://en.wikipedia.org/wiki/http/en.m.wikipedia.org/wiki/Main_Page',
        uri='https://en.wikipedia.org/wiki/Main_Page',
        document=existing)

    for docuri in (self_claim, other_claim):
        db.Session.add(docuri)
    db.Session.flush()

    extra_uris = [
        'https://en.wikipedia.org/wiki/http/en.m.wikipedia.org/wiki/Main_Page',
        'https://m.en.wikipedia.org/wiki/Main_Page',
    ]
    found = Document.find_or_create_by_uris(
        db.Session,
        'https://en.wikipedia.org/wiki/Main_Page',
        extra_uris)

    assert found.count() == 1
    assert found.first() == existing
def merge_data(request):
    """
    Persist two documents that both reference the same URI.

    Both documents carry a DocumentURI for the Wikipedia main page and a
    'title' DocumentMeta with the same value; they differ in claimant and in
    the URI type ('self-claim' vs 'rel-canonical').

    Returns a ``(master, duplicate)`` tuple after flushing the session.
    """
    page_uri = 'https://en.wikipedia.org/wiki/Main_Page'
    mobile_claimant = 'https://m.en.wikipedia.org/wiki/Main_Page'
    title = 'Wikipedia, the free encyclopedia'

    master_uris = [
        DocumentURI(claimant=page_uri, uri=page_uri, type='self-claim'),
    ]
    master_meta = [
        DocumentMeta(claimant=page_uri, type='title', value=title),
    ]
    master = Document(document_uris=master_uris, meta=master_meta)

    duplicate_uris = [
        DocumentURI(claimant=mobile_claimant,
                    uri=page_uri,
                    type='rel-canonical'),
    ]
    duplicate_meta = [
        DocumentMeta(claimant=mobile_claimant, type='title', value=title),
    ]
    duplicate = Document(document_uris=duplicate_uris, meta=duplicate_meta)

    db.Session.add_all([master, duplicate])
    db.Session.flush()
    return master, duplicate
def create_or_update_document_uri(es_docuri, pg_document):
    """
    Create the DocumentURI matching ``es_docuri``, or touch the existing one.

    An existing row is looked up by normalized claimant, normalized URI, type
    and content type. If none is found, a new DocumentURI attached to
    ``pg_document`` is added to the session. If one is found but belongs to a
    different document, a warning is logged and the row is left attached to
    its current document. In every case the row's ``updated`` value is set
    from ``es_docuri``.

    :param es_docuri: source object providing claimant, uri, type,
        content_type, created and updated (plus the normalized variants).
    :param pg_document: the document the URI is expected to belong to.
    """
    docuri = DocumentURI.query.filter(
        DocumentURI.claimant_normalized == es_docuri.claimant_normalized,
        DocumentURI.uri_normalized == es_docuri.uri_normalized,
        DocumentURI.type == es_docuri.type,
        DocumentURI.content_type == es_docuri.content_type).first()

    if docuri is None:
        docuri = DocumentURI(claimant=es_docuri.claimant,
                             uri=es_docuri.uri,
                             type=es_docuri.type,
                             content_type=es_docuri.content_type,
                             document=pg_document,
                             created=es_docuri.created,
                             updated=es_docuri.updated)
        Session.add(docuri)
    elif not docuri.document == pg_document:
        # ``warn`` is a deprecated alias of ``warning`` on stdlib loggers.
        # NOTE(review): assumes ``log`` is a ``logging.Logger`` -- confirm.
        log.warning(
            'Found DocumentURI with id %d does not match expected '
            'document with id %d',
            docuri.id, pg_document.id)

    docuri.updated = es_docuri.updated
def test_document_find_or_create_by_uris_no_results():
    """find_or_create_by_uris yields a new self-claim document when nothing matches."""
    unrelated = DocumentURI(
        claimant='https://en.wikipedia.org/wiki/Main_Page',
        uri='https://en.wikipedia.org/wiki/Main_Page',
        document=Document())
    db.Session.add(unrelated)
    db.Session.flush()

    result = Document.find_or_create_by_uris(
        db.Session,
        'https://en.wikipedia.org/wiki/Pluto',
        ['https://m.en.wikipedia.org/wiki/Pluto'])
    assert result.count() == 1

    created = result.first()
    assert isinstance(created, Document)
    assert len(created.document_uris) == 1

    created_uri = created.document_uris[0]
    assert created_uri.claimant == 'https://en.wikipedia.org/wiki/Pluto'
    assert created_uri.uri == 'https://en.wikipedia.org/wiki/Pluto'
    assert created_uri.type == 'self-claim'