def test_you_cannot_set_content_type_to_null(self, db_session):
    """Flushing a persisted DocumentURI whose content_type was set to None
    must raise an IntegrityError."""
    example_url = "http://www.example.com"
    parent = document.Document()
    docuri = document.DocumentURI(
        claimant=example_url,
        uri=example_url,
        type="foo",
        content_type="bar",
        document=parent,
    )
    db_session.add(docuri)
    # Persist the valid row first so the failure below is caused by the
    # subsequent update alone.
    db_session.flush()

    docuri.content_type = None

    with pytest.raises(sa.exc.IntegrityError):
        db_session.flush()
def test_with_one_existing_Document(self, db_session):
    """
    A single matching Document is returned as-is.

    Two DocumentURIs that both point at the same Document are stored; a
    lookup whose URIs match both of them should yield exactly that one
    Document.
    """
    main_page = "https://en.wikipedia.org/wiki/Main_Page"
    mobile_claimant = (
        "https://en.wikipedia.org/wiki/http/en.m.wikipedia.org/wiki/Main_Page"
    )
    shared_document = document.Document()

    db_session.add_all([
        document.DocumentURI(
            claimant=main_page, uri=main_page, document=shared_document
        ),
        document.DocumentURI(
            claimant=mobile_claimant, uri=main_page, document=shared_document
        ),
    ])
    db_session.flush()

    other_uris = [
        mobile_claimant,
        "https://m.en.wikipedia.org/wiki/Main_Page",
    ]
    actual = document.Document.find_or_create_by_uris(
        db_session, main_page, other_uris
    )

    assert actual.count() == 1
    assert actual.first() == shared_document
def test_it_creates_a_new_DocumentURI_if_there_is_no_existing_one(
        self, db_session):
    """A DocumentURI that differs only in content_type is not a match, so
    create_or_update_document_uri() must add a fresh row."""
    claimant = "http://example.com/example_claimant.html"
    uri = "http://example.com/example_uri.html"
    type_ = "self-claim"
    content_type = ""
    document_ = document.Document()
    created = yesterday()
    updated = yesterday()

    # Seed the database with a single DocumentURI that must NOT match the
    # query: everything is the same except for its content_type.
    non_matching = document.DocumentURI(
        claimant=claimant,
        uri=uri,
        type=type_,
        content_type="different",
        document=document_,
        created=created,
        updated=updated,
    )
    db_session.add(non_matching)

    document.create_or_update_document_uri(
        session=db_session,
        claimant=claimant,
        uri=uri,
        type=type_,
        content_type=content_type,
        document=document_,
        created=now(),
        updated=now(),
    )

    # The most recently created row is the one the call above should have
    # inserted.
    newest_first = db_session.query(document.DocumentURI).order_by(
        document.DocumentURI.created.desc()
    )
    document_uri = newest_first.first()

    assert document_uri.claimant == claimant
    assert document_uri.uri == uri
    assert document_uri.type == type_
    assert document_uri.content_type == content_type
    assert document_uri.document == document_
    assert document_uri.created > created
    assert document_uri.updated > updated
def test_it_creates_a_new_DocumentURI_if_there_is_no_existing_one(
        self, db_session):
    """create_or_update_document_uri() adds a new DocumentURI when the only
    existing row differs in content_type.

    The newly created row is located by ordering on ``created`` descending
    rather than by taking the last element of an unordered result, because
    the row order of a query without ORDER BY is not guaranteed.
    """
    claimant = 'http://example.com/example_claimant.html'
    uri = 'http://example.com/example_uri.html'
    type_ = 'self-claim'
    content_type = ''
    document_ = document.Document()
    created = yesterday()
    updated = yesterday()

    # Add one non-matching DocumentURI to the database.
    db_session.add(
        document.DocumentURI(
            claimant=claimant,
            uri=uri,
            type=type_,
            # Different content_type means this DocumentURI should not match
            # the query.
            content_type='different',
            document=document_,
            created=created,
            updated=updated,
        ))

    document.create_or_update_document_uri(
        session=db_session,
        claimant=claimant,
        uri=uri,
        type=type_,
        content_type=content_type,
        document=document_,
        created=now(),
        updated=now(),
    )

    # Explicit ordering: the seeded row was created "yesterday" and the new
    # one "now", so the newest row is deterministically the new one.
    document_uri = (
        db_session.query(document.DocumentURI)
        .order_by(document.DocumentURI.created.desc())
        .first()
    )

    assert document_uri.claimant == claimant
    assert document_uri.uri == uri
    assert document_uri.type == type_
    assert document_uri.content_type == content_type
    assert document_uri.document == document_
    assert document_uri.created > created
    assert document_uri.updated > updated
def test_it_updates_the_existing_DocumentURI_if_there_is_one(
        self, db_session):
    """When a matching DocumentURI already exists it is updated in place:
    ``created`` is kept, ``updated`` is replaced, and no row is added."""
    document_ = document.Document()
    created = yesterday()
    updated = yesterday()

    # Fields shared by the pre-existing row and by the update call, so the
    # call is guaranteed to target that row.
    identity = {
        "claimant": "http://example.com/example_claimant.html",
        "uri": "http://example.com/example_uri.html",
        "type": "self-claim",
        "content_type": "",
        "document": document_,
    }

    existing = document.DocumentURI(
        created=created, updated=updated, **identity
    )
    db_session.add(existing)

    now_ = now()
    document.create_or_update_document_uri(
        session=db_session, created=now_, updated=now_, **identity
    )

    assert existing.created == created
    assert existing.updated == now_

    all_document_uris = db_session.query(document.DocumentURI).all()
    assert (
        len(all_document_uris) == 1
    ), "It shouldn't have added any new objects to the db"
def merge_data(self, db_session, request):
    """Populate the session with three Documents that share one URI, plus
    two Annotations per Document.

    Each Document gets one DocumentURI and one "title" DocumentMeta from the
    same claimant. The Documents are added and flushed before the
    Annotations are built, because the Annotations reference each
    Document's ``id``.

    Returns the tuple ``(master, duplicate_1, duplicate_2)``. ``request`` is
    accepted but not used in this body.
    """
    main_page = "https://en.wikipedia.org/wiki/Main_Page"

    def build_document(claimant, uri_type):
        # One DocumentURI and one title DocumentMeta, both from `claimant`.
        return document.Document(
            document_uris=[
                document.DocumentURI(
                    claimant=claimant,
                    uri=main_page,
                    type=uri_type,
                )
            ],
            meta=[
                document.DocumentMeta(
                    claimant=claimant,
                    type="title",
                    value="Wikipedia, the free encyclopedia",
                )
            ],
        )

    master = build_document(
        "https://en.wikipedia.org/wiki/Main_Page", "self-claim"
    )
    duplicate_1 = build_document(
        "https://m.en.wikipedia.org/wiki/Main_Page", "rel-canonical"
    )
    duplicate_2 = build_document(
        "https://en.wikipedia.org/wiki/Home", "rel-canonical"
    )

    db_session.add_all([master, duplicate_1, duplicate_2])
    db_session.flush()

    # Two annotations per document, created in the same order as before:
    # master first, then duplicate_1, then duplicate_2.
    annotators = (
        (master, ("luke", "alice")),
        (duplicate_1, ("lucy", "bob")),
        (duplicate_2, ("amy", "dan")),
    )
    annotations = []
    for doc, userids in annotators:
        for userid in userids:
            annotations.append(
                models.Annotation(userid=userid, document_id=doc.id)
            )
    db_session.add_all(annotations)

    return (master, duplicate_1, duplicate_2)
def test_it_normalizes_the_uri(self):
    """uri_normalized for "http://example.com/" is "httpx://example.com"."""
    expected = "httpx://example.com"

    normalized = document.DocumentURI(uri="http://example.com/").uri_normalized

    assert normalized == expected