コード例 #1
0
def test_indexing_for_fresh(session, mock_s3):
    """Check that ``sut.ensure_fulltext`` indexes a crawl with no prior row.

    Stores a URL, a crawl request and a ``text/html`` crawl response,
    uploads ``simple-website.html`` under the response's body UUID, and
    then expects a ``FullText`` row (looked up by the URL UUID) whose
    fields match the crawl.

    The ``mock_s3`` fixture is requested but never referenced in the
    body; presumably it stubs out the bucket storage -- confirm against
    its definition.
    """
    page_url = "http://example.com/" + random_string()
    # Pair each URL column with its component, in urlsplit's order.
    url_columns = dict(
        zip(("scheme", "netloc", "path", "query", "fragment"),
            urlsplit(page_url)))
    crawl_uuid = uuid4()
    url_uuid = sut.URL.from_string(page_url).url_uuid
    body_uuid = uuid4()

    session.add_all([
        sut.SQLAUrl(url_uuid=url_uuid, **url_columns),
        sut.CrawlRequest(
            crawl_uuid=crawl_uuid,
            url_uuid=url_uuid,
            requested=datetime(2018, 1, 3),
            got_response=True,
        ),
        sut.CrawlResponse(
            crawl_uuid=crawl_uuid,
            headers={"content-type": "text/html"},
            body_uuid=body_uuid,
            status_code=200,
        ),
    ])
    session.commit()

    # Store the HTML fixture in the bucket under the body UUID.
    fixture_path = path.join(test_data_path, "simple-website.html")
    bucket = sut.get_response_body_bucket()
    with open(fixture_path, "rb") as fixture_file:
        sut.upload_file(bucket, fixture_file, str(body_uuid))

    sut.ensure_fulltext(crawl_uuid)

    indexed = session.query(sut.FullText).get(url_uuid)
    assert indexed.url_uuid == url_uuid
    assert indexed.crawl_uuid == crawl_uuid
    # The request timestamp above is naive; the stored value is expected
    # to come back as the same date/time carrying a UTC tzinfo.
    expected_inserted = datetime(2018, 1, 3, tzinfo=timezone.utc)
    assert indexed.inserted == expected_inserted
    tsvector_tokens = indexed.tsvector.split(" ")
    assert len(tsvector_tokens) == 6
    assert len(indexed.full_text) > 0
コード例 #2
0
def test_indexing_idempotent(session, mock_s3):
    """Check that ``sut.ensure_fulltext`` keeps a single existing row.

    A ``FullText`` row for the crawl is committed up front together with
    the URL, crawl request and crawl response. After the call, exactly
    one ``FullText`` row must exist for the URL UUID.

    The ``mock_s3`` fixture is requested but never referenced in the
    body; presumably it stubs out the bucket storage -- confirm against
    its definition.
    """
    page_url = "http://example.com/" + random_string()
    # Pair each URL column with its component, in urlsplit's order.
    url_columns = dict(
        zip(("scheme", "netloc", "path", "query", "fragment"),
            urlsplit(page_url)))
    crawl_uuid = uuid4()
    url_uuid = sut.URL.from_string(page_url).url_uuid
    body_uuid = uuid4()

    session.add_all([
        sut.SQLAUrl(url_uuid=url_uuid, **url_columns),
        sut.CrawlRequest(
            crawl_uuid=crawl_uuid,
            url_uuid=url_uuid,
            requested=datetime(2018, 1, 3),
            got_response=True,
        ),
        sut.CrawlResponse(
            crawl_uuid=crawl_uuid,
            headers={"content-type": "text/html"},
            body_uuid=body_uuid,
            status_code=200,
        ),
        # Pre-existing index entry for the same crawl.
        sut.FullText(
            url_uuid=url_uuid,
            crawl_uuid=crawl_uuid,
            inserted=datetime(2018, 1, 3, tzinfo=timezone.utc),
            full_text="hello world",
            tsvector=func.to_tsvector("hello world"),
        ),
    ])
    session.commit()

    sut.ensure_fulltext(crawl_uuid)

    rows_for_url = session.query(sut.FullText).filter(
        sut.FullText.url_uuid == url_uuid)
    assert rows_for_url.count() == 1
コード例 #3
0
def test_indexing_non_html(session):
    """Check that no ``FullText`` row is created for a non-HTML response.

    The crawl response is stored with an ``application/pdf`` content
    type and no body is uploaded. After ``sut.ensure_fulltext`` runs,
    there must be no ``FullText`` row for the crawl UUID.
    """
    page_url = "http://example.com/" + random_string()
    # Pair each URL column with its component, in urlsplit's order.
    url_columns = dict(
        zip(("scheme", "netloc", "path", "query", "fragment"),
            urlsplit(page_url)))
    crawl_uuid = uuid4()
    url_uuid = sut.URL.from_string(page_url).url_uuid
    body_uuid = uuid4()

    session.add_all([
        sut.SQLAUrl(url_uuid=url_uuid, **url_columns),
        sut.CrawlRequest(
            crawl_uuid=crawl_uuid,
            url_uuid=url_uuid,
            requested=datetime(2018, 1, 3),
            got_response=True,
        ),
        sut.CrawlResponse(
            crawl_uuid=crawl_uuid,
            headers={"content-type": "application/pdf"},
            body_uuid=body_uuid,
            status_code=200,
        ),
    ])
    session.commit()

    sut.ensure_fulltext(crawl_uuid)

    rows_for_crawl = session.query(sut.FullText).filter(
        sut.FullText.crawl_uuid == crawl_uuid)
    assert rows_for_crawl.count() == 0