Example #1
0
def test_indexing_non_html(session, mock_s3):
    """Indexing a crawl whose response is labelled as a PDF stores no FullText."""
    _url, request, response = make_crawl_with_response(session)
    # Relabel the crawl response so that it is not HTML.
    response.headers["content-type"] = "application/pdf"  # type: ignore

    indexing.index(session, request.crawl_uuid)

    rows_for_crawl = session.query(FullText).filter(
        FullText.crawl_uuid == request.crawl_uuid
    )
    assert rows_for_crawl.count() == 0
Example #2
0
def test_indexing_for_fresh(session, mock_s3):
    """Indexing a new crawl stores a FullText row retrievable by url_uuid."""
    url, request, _response = make_crawl_with_response(session)

    indexing.index(session, request.crawl_uuid)

    stored = session.query(FullText).get(url.url_uuid)
    expected_inserted = datetime(2018, 1, 3, tzinfo=timezone.utc)

    # The row must point back at both the URL and the crawl it came from.
    assert stored.url_uuid == url.url_uuid
    assert stored.crawl_uuid == request.crawl_uuid
    assert stored.inserted == expected_inserted

    # The space-separated tsvector is expected to have exactly ten entries.
    lexemes = stored.tsvector.split(" ")
    assert len(lexemes) == 10
    assert len(stored.full_text) > 0
Example #3
0
def test_indexing_with_content_type_problems(session, mock_s3, headers):
    """Indexing still stores a FullText row when the response headers are replaced.

    ``headers`` is supplied from outside this function (presumably a
    parametrized set of problematic content-type headers; confirm against the
    decorator or fixture that provides it).
    """
    url, request, response = make_crawl_with_response(session)
    response.headers = headers

    indexing.index(session, request.crawl_uuid)

    stored = session.query(FullText).get(url.url_uuid)
    expected_inserted = datetime(2018, 1, 3, tzinfo=timezone.utc)

    # The row must point back at both the URL and the crawl it came from.
    assert stored.url_uuid == url.url_uuid
    assert stored.crawl_uuid == request.crawl_uuid
    assert stored.inserted == expected_inserted

    # The space-separated tsvector is expected to have exactly ten entries.
    lexemes = stored.tsvector.split(" ")
    assert len(lexemes) == 10
    assert len(stored.full_text) > 0
Example #4
0
def test_index_throws_an_error(session, mock_s3):
    """A failing index run records one IndexingError and a re-run adds none.

    The first ``indexing.index`` call runs with ``extract_metadata_from_html``
    patched to raise ``RuntimeError``. The second call runs unpatched and must
    leave the number of recorded errors for the crawl unchanged.
    """
    sqla_url, crawl_req, crawl_resp = make_crawl_with_response(session)
    session.commit()

    def count_errors():
        """Return the current number of IndexingError rows for this crawl."""
        return (session.query(IndexingError).filter(
            IndexingError.crawl_uuid == crawl_req.crawl_uuid).count())

    # First time, error thrown and recorded
    with mock.patch.object(indexing, "extract_metadata_from_html") as mock_gmd:
        mock_gmd.side_effect = RuntimeError
        indexing.index(session, crawl_req.crawl_uuid)

    assert count_errors() == 1

    # Second time, it's skipped
    indexing.index(session, crawl_req.crawl_uuid)
    # Query again: reusing the count taken before this call would make the
    # assertion pass regardless of what the second run did.
    assert count_errors() == 1
Example #5
0
def on_index_requested(message: PickleMessage, ctx: missive.HandlingContext):
    """Handle an IndexRequested message by indexing the crawl it names.

    The crawl is indexed first. If indexing returns truthy metadata,
    ``icon_message_if_necessary`` is asked for a follow-up message. The
    session is then committed and the message acknowledged, and only after
    that is any follow-up message published to the topic named by the
    ``QM_RABBITMQ_BG_WORKER_TOPIC`` environment variable.
    """
    requested = cast(IndexRequested, message.get_obj())
    db_session = get_session(ctx)

    metadata = indexing.index(db_session, requested.crawl_uuid)
    icon_message = (
        icon_message_if_necessary(db_session, metadata) if metadata else None
    )

    # Commit and acknowledge before publishing any follow-up message.
    db_session.commit()
    ctx.ack()

    if not icon_message:
        return
    publish_message(icon_message, environ["QM_RABBITMQ_BG_WORKER_TOPIC"])
Example #6
0
def test_indexing_idempotent(session, mock_s3):
    """Indexing a crawl that already has a FullText row adds no row and no error."""
    url, request, _response = make_crawl_with_response(session)

    # Seed the database with a FullText row for the same URL and crawl.
    session.add(
        FullText(
            url_uuid=url.url_uuid,
            crawl_uuid=request.crawl_uuid,
            inserted=datetime(2018, 1, 3, tzinfo=timezone.utc),
            full_text="hello world",
            tsvector=func.to_tsvector("hello world"),
        )
    )
    session.commit()

    indexing.index(session, request.crawl_uuid)

    rows_for_url = session.query(FullText).filter(
        FullText.url_uuid == url.url_uuid
    )
    assert rows_for_url.count() == 1

    errors_for_crawl = session.query(IndexingError).filter(
        IndexingError.crawl_uuid == request.crawl_uuid
    )
    assert errors_for_crawl.count() == 0