Ejemplo n.º 1
0
    def test_with_one_matching_Document(self):
        """find_by_uris() should return just the one Document that matches."""
        german_uri = 'https://de.wikipedia.org/wiki/Hauptseite'
        english_uri = 'https://en.wikipedia.org/wiki/Main_Page'
        english_root = 'https://en.wikipedia.org'

        # Decoy Document: its only DocumentURI is not among the URIs that are
        # searched for below, so it should not be returned.
        unrelated = document.Document()
        unrelated.document_uris.append(
            document.DocumentURI(claimant=german_uri, uri=german_uri))

        # Target Document: it gets two DocumentURIs that share the
        # searched-for uri but have different claimants.
        expected = document.Document()
        for claimant in (english_uri, english_root):
            expected.document_uris.append(
                document.DocumentURI(claimant=claimant, uri=english_uri))

        db.Session.add_all([unrelated, expected])
        db.Session.flush()

        searched_uris = [
            english_uri,
            'https://m.en.wikipedia.org/wiki/Main_Page',
        ]
        actual = document.Document.find_by_uris(db.Session, searched_uris)

        assert actual.count() == 1
        assert actual.first() == expected
Ejemplo n.º 2
0
    def test_with_one_existing_Document(self):
        """
        It should return the one existing Document that the URIs point to.

        Two DocumentURIs with the same uri but different claimants are
        attached to a single Document; the result is expected to contain
        exactly that Document.

        """
        main_page = 'https://en.wikipedia.org/wiki/Main_Page'
        proxied_claimant = (
            'https://en.wikipedia.org/wiki/http/en.m.wikipedia.org/wiki/Main_Page')

        existing = document.Document()
        # Build both DocumentURIs first, then add them in the same order.
        docuris = [
            document.DocumentURI(
                claimant=claimant, uri=main_page, document=existing)
            for claimant in (main_page, proxied_claimant)
        ]
        for docuri in docuris:
            db.Session.add(docuri)
        db.Session.flush()

        other_uris = [
            proxied_claimant,
            'https://m.en.wikipedia.org/wiki/Main_Page',
        ]
        actual = document.Document.find_or_create_by_uris(
            db.Session, main_page, other_uris)

        assert actual.count() == 1
        assert actual.first() == existing
Ejemplo n.º 3
0
    def merge_data(self, request):
        """
        Add a "master" and a "duplicate" Document to the session.

        Each Document gets one DocumentURI and one title DocumentMeta. Both
        DocumentURIs share the same uri but differ in claimant and type.

        `request` is not used in the body.
        NOTE(review): presumably this is a pytest fixture whose decorator is
        outside this view -- confirm.

        Returns a (master, duplicate) tuple.
        """
        desktop_url = 'https://en.wikipedia.org/wiki/Main_Page'
        mobile_url = 'https://m.en.wikipedia.org/wiki/Main_Page'
        page_title = 'Wikipedia, the free encyclopedia'

        master_uri = document.DocumentURI(
            claimant=desktop_url, uri=desktop_url, type='self-claim')
        master_title = document.DocumentMeta(
            claimant=desktop_url, type='title', value=page_title)
        master = document.Document(
            document_uris=[master_uri], meta=[master_title])

        duplicate_uri = document.DocumentURI(
            claimant=mobile_url, uri=desktop_url, type='rel-canonical')
        duplicate_title = document.DocumentMeta(
            claimant=mobile_url, type='title', value=page_title)
        duplicate = document.Document(
            document_uris=[duplicate_uri], meta=[duplicate_title])

        db.Session.add_all([master, duplicate])
        db.Session.flush()
        return master, duplicate
Ejemplo n.º 4
0
    def test_it_updates_the_existing_DocumentURI_if_there_is_one(self):
        """
        An existing DocumentURI should be updated rather than duplicated.

        After the call, `created` is expected to keep its old value,
        `updated` to take the new one, and the table to still hold one row.
        """
        # Keyword arguments shared by the pre-existing row and by the
        # create_or_update_document_uri() call, so that the two match.
        shared_fields = dict(
            claimant='http://example.com/example_claimant.html',
            uri='http://example.com/example_uri.html',
            type='self-claim',
            content_type=None,
            document=document.Document(),
        )
        old_created = yesterday()
        old_updated = yesterday()

        existing = document.DocumentURI(
            created=old_created, updated=old_updated, **shared_fields)
        db.Session.add(existing)

        call_time = now()
        document.create_or_update_document_uri(
            session=db.Session,
            created=call_time,
            updated=call_time,
            **shared_fields
        )

        assert existing.created == old_created
        assert existing.updated == call_time

        all_document_uris = db.Session.query(document.DocumentURI).all()
        assert len(all_document_uris) == 1, (
            "It shouldn't have added any new objects to the db")
Ejemplo n.º 5
0
    def test_with_no_existing_documents(self):
        """With no matching Document it should create and return a new one."""
        main_page = 'https://en.wikipedia.org/wiki/Main_Page'
        pluto = 'https://en.wikipedia.org/wiki/Pluto'

        # Store an unrelated Document (via its DocumentURI) so the table is
        # not empty; none of its URIs are searched for below.
        db.Session.add(
            document.DocumentURI(
                claimant=main_page,
                uri=main_page,
                document=document.Document()))
        db.Session.flush()

        documents = document.Document.find_or_create_by_uris(
            db.Session, pluto, ['https://m.en.wikipedia.org/wiki/Pluto'])

        assert documents.count() == 1

        created_document = documents.first()
        assert isinstance(created_document, document.Document)
        assert len(created_document.document_uris) == 1

        # The single DocumentURI should be a self-claim for the claimant URI.
        new_docuri = created_document.document_uris[0]
        assert new_docuri.claimant == pluto
        assert new_docuri.uri == pluto
        assert new_docuri.type == 'self-claim'
Ejemplo n.º 6
0
    def test_no_matches(self):
        """find_by_uris() should return nothing when no stored URI matches."""
        stored_uri = 'https://en.wikipedia.org/wiki/Main_Page'
        stored_document = document.Document(
            document_uris=[
                document.DocumentURI(claimant=stored_uri, uri=stored_uri),
            ])
        db.Session.add(stored_document)
        db.Session.flush()

        # Search for a URI that was never stored.
        results = document.Document.find_by_uris(
            db.Session, ['https://de.wikipedia.org/wiki/Hauptseite'])

        assert results.count() == 0
Ejemplo n.º 7
0
    def test_it_creates_a_new_DocumentURI_if_there_is_no_existing_one(self):
        """
        A new DocumentURI should be created when no existing one matches.

        A DocumentURI that differs only in content_type is stored first; the
        call is then expected to add a second row with the requested values
        rather than reuse the stored one.
        """
        claimant = 'http://example.com/example_claimant.html'
        uri = 'http://example.com/example_uri.html'
        type_ = 'self-claim'
        content_type = None
        document_ = document.Document()
        created = yesterday()
        updated = yesterday()

        # Add one non-matching DocumentURI to the database.
        db.Session.add(
            document.DocumentURI(
                claimant=claimant,
                uri=uri,
                type=type_,
                # Different content_type means this DocumentURI should not match
                # the query.
                content_type='different',
                document=document_,
                created=created,
                updated=updated,
            ))

        document.create_or_update_document_uri(
            session=db.Session,
            claimant=claimant,
            uri=uri,
            type=type_,
            content_type=content_type,
            document=document_,
            created=now(),
            updated=now(),
        )

        # Fetch the most recently created row with an explicit ORDER BY.
        # Indexing the last element of an unordered query relies on a row
        # order that the database does not guarantee.
        document_uri = (
            db.Session.query(document.DocumentURI)
            .order_by(document.DocumentURI.created.desc())
            .first())
        assert document_uri.claimant == claimant
        assert document_uri.uri == uri
        assert document_uri.type == type_
        assert document_uri.content_type == content_type
        assert document_uri.document == document_
        assert document_uri.created > created
        assert document_uri.updated > updated