Esempio n. 1
0
def test_execute_skipping(tmp_path: Path, caplog: LogCaptureFixture) -> None:
    """Check that re-executing an already completed request is skipped.

    The batch is dumped and executed once, then reloaded from the dumped
    file and executed again. The second run must log exactly one record
    containing "Skipping" and must leave the modification times of the
    meta and data files unchanged.
    """
    dumped_batch = tmp_path / "batch.jsonl"
    out_dir = tmp_path / "out"

    # First run: the request is actually executed.
    first_batch = Batch()
    first_batch.append(Search("trump", max_tweets=50))
    first_batch.dump(dumped_batch)
    assert first_batch.execute(out_dir)
    _assert_results_dir_structure(out_dir, list(first_batch))

    # Remember the file stats produced by the first run.
    entry = first_batch[0]
    meta_path = out_dir / entry.meta_file_name
    data_path = out_dir / entry.data_file_name
    meta_before = meta_path.stat()
    data_before = data_path.stat()

    # Second run: recreate the batch from the dumped file so that the batch
    # entry IDs match those of the first run; the request should be skipped.
    second_batch = Batch()
    second_batch.load(dumped_batch)
    caplog.clear()
    assert second_batch.execute(out_dir)
    skip_records = [rec for rec in caplog.records if "Skipping" in rec.msg]
    assert len(skip_records) == 1
    _assert_results_dir_structure(out_dir, list(second_batch))
    meta_after = meta_path.stat()
    data_after = data_path.stat()

    # Neither file may have been touched by the second run.
    assert meta_before.st_mtime_ns == meta_after.st_mtime_ns
    assert data_before.st_mtime_ns == data_after.st_mtime_ns
Esempio n. 2
0
def test_execute_stray_data_file(tmp_path: Path,
                                 caplog: LogCaptureFixture) -> None:
    """Check that a pre-existing data file makes execution fail untouched.

    A data file with unrelated content is placed where the batch entry
    would write its results. Executing the batch must then fail, record a
    ``ValueError`` in the entry's meta file, and leave the stray file
    unmodified. After the stray file is removed, executing again must
    succeed and log exactly one record containing "Retrying".
    """
    batch = Batch()
    batch.append(Search("trump", max_tweets=50))

    batch_entry = batch[0]
    data = "Just some stray data."
    data_file = tmp_path / batch_entry.data_file_name
    write_file(data_file, data)
    data_stat1 = data_file.stat()

    assert not batch.execute(tmp_path)

    # Assert exception is saved.
    assert batch_entry == read_json(tmp_path / batch_entry.meta_file_name,
                                    BatchEntry)
    assert batch_entry.exception is not None
    assert batch_entry.exception.type == "ValueError"
    batch_entry.exception = None

    # Assert that previous data file is not modified. Compare the integer
    # nanosecond timestamp rather than the float ``st_mtime``, which cannot
    # represent the full resolution and could hide a very quick rewrite.
    data_stat2 = data_file.stat()
    assert data_stat1.st_mtime_ns == data_stat2.st_mtime_ns
    assert data == read_file(data_file)

    # Delete data file and verify that it works now.
    data_file.unlink()
    caplog.clear()
    assert batch.execute(tmp_path)
    retry_records = [
        record for record in caplog.records if "Retrying" in record.msg
    ]
    assert len(retry_records) == 1
    _assert_results_dir_structure(tmp_path, list(batch))
Esempio n. 3
0
def test_execute_success(tmp_path: Path) -> None:
    """Execute a batch of three searches and check the results directory."""
    search_batch = Batch()
    # Same three queries, appended in the same order.
    for query in ("trump", "hillary", "obama"):
        search_batch.append(Search(query, max_tweets=50))

    succeeded = search_batch.execute(tmp_path)
    assert succeeded
    _assert_results_dir_structure(tmp_path, list(search_batch))
Esempio n. 4
0
def test_execute_success_empty(tmp_path: Path) -> None:
    """Execute a search that yields no Tweets and expect an empty result."""
    # Random string that currently does not match any Tweet.
    no_match_query = "c9dde8b5451149e683d4f07e4c4348ef"

    empty_batch = Batch()
    empty_batch.append(Search(no_match_query))

    batch_results = empty_batch.execute(tmp_path)
    assert batch_results

    # The only entry of the results must not contain a single Tweet.
    found_tweets = list(batch_results.tweets(batch_results[0]))
    assert not found_tweets
    _assert_results_dir_structure(tmp_path,
                                  list(empty_batch),
                                  allow_empty=True)
Esempio n. 5
0
def test_unidify_fail_and_restart(
    requests: Iterable[Request],
    settings: NastySettings,
    monkeypatch: MonkeyPatch,
    tmp_path: Path,
) -> None:
    """Check that unidify can be restarted after failing part-way.

    ``statuses_lookup`` is first replaced by a mock that lacks one Tweet,
    so that ``unidify`` raises ``KeyError`` and leaves fewer entries in the
    unidify directory than the batch has. It is then replaced by a mock
    that knows every Tweet, and a second ``unidify`` call must yield all
    entries and exactly the original Tweets.
    """
    idify_dir = tmp_path / "idify"
    unidify_dir = tmp_path / "unidify"

    batch = Batch()
    for request in requests:
        batch.append(request)
    executed = batch.execute()
    assert executed is not None

    # All Tweets of the executed batch, keyed by their ID.
    all_tweets = {
        tweet.id: tweet
        for entry in executed for tweet in executed.tweets(entry)
    }
    # Copy with a single Tweet removed, used to provoke the failure.
    incomplete_tweets = dict(all_tweets)
    incomplete_tweets.pop(TweetId("1115690615612825601"))

    idified = executed.idify(idify_dir)
    assert idified is not None

    monkeypatch.setattr(
        nasty.batch.batch_results,
        nasty.batch.batch_results.statuses_lookup.__name__,  # type: ignore
        _mock_statuses_lookup(incomplete_tweets),
    )

    # Assert KeyError is propagated, because a Tweet is missing from
    # incomplete_tweets.
    with pytest.raises(KeyError):
        idified.unidify(settings.twitter_api, unidify_dir)
    partial_results = BatchResults(unidify_dir)
    assert len(batch) > len(partial_results)

    monkeypatch.setattr(
        nasty.batch.batch_results,
        nasty.batch.batch_results.statuses_lookup.__name__,  # type: ignore
        _mock_statuses_lookup(all_tweets),
    )

    # Restart with the complete lookup; now everything must be restored.
    restored = idified.unidify(settings.twitter_api, unidify_dir)
    assert restored is not None
    assert len(batch) == len(restored)
    restored_tweets = {
        tweet.id: tweet
        for entry in restored for tweet in restored.tweets(entry)
    }
    assert all_tweets == restored_tweets
Esempio n. 6
0
def test_execute_success_parallel(tmp_path: Path,
                                  monkeypatch: MonkeyPatch) -> None:
    """Execute 16 one-day searches with ``NASTY_NUM_WORKERS`` set to 4."""
    monkeypatch.setenv("NASTY_NUM_WORKERS", "4")

    parallel_batch = Batch()
    # One search per day, covering 2019-01-01 through 2019-01-16.
    for day in range(1, 17):
        one_day_search = Search(
            "trump",
            since=date(2019, 1, day),
            until=date(2019, 1, day + 1),
            max_tweets=50,
        )
        parallel_batch.append(one_day_search)

    succeeded = parallel_batch.execute(tmp_path)
    assert succeeded
    _assert_results_dir_structure(tmp_path, list(parallel_batch))
Esempio n. 7
0
def _make_batch_results(*,
                        idify_dir: Optional[Path] = None,
                        unidify_dir: Optional[Path] = None) -> BatchResults:
    """Execute a single-thread batch and optionally idify/unidify it.

    Args:
        idify_dir: If given, the results are idified into this directory.
        unidify_dir: If given, the (possibly idified) results are unidified
            into this directory.

    Returns:
        The results of the last step that was performed.
    """
    thread_batch = Batch()
    thread_batch.append(Thread("1115689254271819777"))

    current = thread_batch.execute()
    assert current is not None

    if idify_dir is not None:
        idified = current.idify(idify_dir)
        assert idified is not None
        current = idified

    if unidify_dir is not None:
        unidified = current.unidify(unidify_dir)
        assert unidified is not None
        current = unidified

    return current
Esempio n. 8
0
def _make_batch_results(
    settings: NastySettings,
    *,
    idify_dir: Optional[Path] = None,
    unidify_dir: Optional[Path] = None,
) -> BatchResults:
    """Execute a single-thread batch and optionally idify/unidify it.

    Args:
        settings: Settings whose ``twitter_api`` is passed to ``unidify``.
        idify_dir: If given, the results are idified into this directory.
        unidify_dir: If given, the (possibly idified) results are unidified
            into this directory.

    Returns:
        The results of the last step that was performed.
    """
    thread_batch = Batch()
    thread_batch.append(Thread("1115689254271819777"))

    current = thread_batch.execute()
    assert current is not None

    if idify_dir is not None:
        idified = current.idify(idify_dir)
        assert idified is not None
        current = idified

    if unidify_dir is not None:
        unidified = current.unidify(settings.twitter_api, unidify_dir)
        assert unidified is not None
        current = unidified

    return current
Esempio n. 9
0
def test_execute_exception_internal_server_error(tmp_path: Path) -> None:
    """Check that a 500 response is recorded as an exception on the entry.

    NOTE(review): presumably the ``responses`` mock is activated by a
    decorator or fixture that is not visible here -- confirm.
    """
    # Simulate 500 Internal Server Error on first request to Twitter.
    responses.add(
        responses.GET,
        "https://mobile.twitter.com/robots.txt",
        body="Crawl-delay: 1",
    )
    responses.add(
        responses.GET,
        "https://mobile.twitter.com/search",
        match_querystring=False,
        status=HTTPStatus.INTERNAL_SERVER_ERROR.value,
    )

    failing_batch = Batch()
    failing_batch.append(Search("trump", max_tweets=50))
    succeeded = failing_batch.execute(tmp_path)
    assert not succeeded

    # The in-memory entry must equal what was written to its meta file,
    # and both must carry the expected exception type.
    entry = failing_batch[0]
    stored_entry = read_json(tmp_path / entry.meta_file_name, BatchEntry)
    assert entry == stored_entry
    assert entry.exception is not None
    assert entry.exception.type == "UnexpectedStatusCodeException"
Esempio n. 10
0
def test_execute_retrying_after_exception(tmp_path: Path,
                                          caplog: LogCaptureFixture) -> None:
    """Check that an entry whose meta file holds an exception is retried.

    A meta file containing a serialized exception is written before the
    batch is executed. The run must succeed, log exactly one record that
    contains both "Retrying" and the previous exception, and rewrite the
    meta file (its modification time must increase).
    """
    retry_batch = Batch()
    retry_batch.append(Search("trump", max_tweets=50))

    # Persist the entry with an exception attached, then clear the
    # exception on the in-memory entry again.
    entry = retry_batch[0]
    exception = _make_json_serialized_exception()
    entry.exception = exception
    meta_path = tmp_path / entry.meta_file_name
    write_json(meta_path, entry)
    entry.exception = None
    stat_before = meta_path.stat()

    caplog.clear()
    assert retry_batch.execute(tmp_path)

    # Assert that log says we are retrying and the previous exception.
    retry_records = [
        rec for rec in caplog.records
        if "Retrying" in rec.msg and str(exception) in rec.msg
    ]
    assert len(retry_records) == 1

    _assert_results_dir_structure(tmp_path, list(retry_batch))
    stat_after = meta_path.stat()
    assert stat_before.st_mtime_ns < stat_after.st_mtime_ns
Esempio n. 11
0
 def run(self) -> None:
     """Load the batch stored in ``self.batch_file`` and execute it.

     Results are written to ``self.results_dir``; the value returned by
     ``execute`` is discarded.
     """
     loaded_batch = Batch()
     loaded_batch.load(self.batch_file)
     loaded_batch.execute(self.results_dir)