def test_execute_skipping(tmp_path: Path, caplog: LogCaptureFixture) -> None:
    """Check that executing an already completed batch a second time is skipped.

    The batch is executed once, dumped, reloaded and executed again into the
    same results directory. The second run must log exactly one "Skipping"
    record and must leave the meta and data files untouched.
    """
    dumped_batch = tmp_path / "batch.jsonl"
    out_dir = tmp_path / "out"

    # First run: the request is actually executed.
    first_batch = Batch()
    first_batch.append(Search("trump", max_tweets=50))
    first_batch.dump(dumped_batch)
    assert first_batch.execute(out_dir)
    _assert_results_dir_structure(out_dir, list(first_batch))

    entry = first_batch[0]
    meta_file = out_dir / entry.meta_file_name
    data_file = out_dir / entry.data_file_name
    meta_before = meta_file.stat()
    data_before = data_file.stat()

    # Second run of the same request (should be skipped). The batch is
    # recreated from the dumped batch file so that batch entry IDs match.
    second_batch = Batch()
    second_batch.load(dumped_batch)
    caplog.clear()
    assert second_batch.execute(out_dir)
    skip_records = [
        record for record in caplog.records if "Skipping" in record.msg
    ]
    assert 1 == len(skip_records)
    _assert_results_dir_structure(out_dir, list(second_batch))

    meta_after = meta_file.stat()
    data_after = data_file.stat()

    # Neither file may have been rewritten by the skipped run.
    assert meta_before.st_mtime_ns == meta_after.st_mtime_ns
    assert data_before.st_mtime_ns == data_after.st_mtime_ns
def test_execute_stray_data_file(tmp_path: Path, caplog: LogCaptureFixture) -> None:
    """Check that a pre-existing data file without meta file makes execution fail.

    A stray data file is placed where the batch entry would write its data.
    Execution must fail, record a ``ValueError`` in the entry's meta file and
    leave the stray file untouched. After the stray file is removed, running
    the batch again must succeed and log exactly one "Retrying" record.
    """
    batch = Batch()
    batch.append(Search("trump", max_tweets=50))
    batch_entry = batch[0]

    data = "Just some stray data."
    data_file = tmp_path / batch_entry.data_file_name
    write_file(data_file, data)
    data_stat1 = data_file.stat()

    assert not batch.execute(tmp_path)

    # Assert exception is saved.
    assert batch_entry == read_json(tmp_path / batch_entry.meta_file_name, BatchEntry)
    assert batch_entry.exception is not None
    assert batch_entry.exception.type == "ValueError"
    batch_entry.exception = None

    # Assert that previous data file is not modified. Compare the integer
    # nanosecond timestamp rather than the float ``st_mtime``, which cannot
    # represent nanosecond resolution exactly.
    data_stat2 = data_file.stat()
    assert data_stat1.st_mtime_ns == data_stat2.st_mtime_ns
    assert data == read_file(data_file)

    # Delete data file and verify that it works now.
    data_file.unlink()
    caplog.clear()
    assert batch.execute(tmp_path)
    retry_records = [
        record for record in caplog.records if "Retrying" in record.msg
    ]
    assert 1 == len(retry_records)
    _assert_results_dir_structure(tmp_path, list(batch))
def test_execute_success(tmp_path: Path) -> None:
    """Check that a batch of three searches executes and yields a valid results dir."""
    batch = Batch()
    for query in ("trump", "hillary", "obama"):
        batch.append(Search(query, max_tweets=50))

    assert batch.execute(tmp_path)
    _assert_results_dir_structure(tmp_path, list(batch))
def test_execute_success_empty(tmp_path: Path) -> None:
    """Check that a search without any matching Tweet still executes successfully."""
    # Random string that currently does not match any Tweet.
    no_match_query = "c9dde8b5451149e683d4f07e4c4348ef"

    batch = Batch()
    batch.append(Search(no_match_query))

    results = batch.execute(tmp_path)
    assert results

    first_entry_tweets = list(results.tweets(results[0]))
    assert not first_entry_tweets
    _assert_results_dir_structure(tmp_path, list(batch), allow_empty=True)
def test_unidify_fail_and_restart(
    requests: Iterable[Request],
    settings: NastySettings,
    monkeypatch: MonkeyPatch,
    tmp_path: Path,
) -> None:
    """Check that a failed unidify run can be restarted and then completes.

    The status lookup is first replaced by a mock that is missing one Tweet,
    so unidifying raises ``KeyError`` and leaves an incomplete results
    directory. The lookup is then replaced by a mock with all Tweets and
    unidifying into the same directory must produce every original Tweet.
    """
    idify_dir = tmp_path / "idify"
    unidify_dir = tmp_path / "unidify"

    batch = Batch()
    for request in requests:
        batch.append(request)
    results = batch.execute()
    assert results is not None

    # Collect every retrieved Tweet by ID; this is the expected end result.
    tweets = {}
    for entry in results:
        for tweet in results.tweets(entry):
            tweets[tweet.id] = tweet

    # Same mapping with a single Tweet removed (KeyError if it is absent).
    tweets_truncated = tweets.copy()
    tweets_truncated.pop(TweetId("1115690615612825601"))

    idified = results.idify(idify_dir)
    assert idified is not None

    monkeypatch.setattr(
        nasty.batch.batch_results,
        nasty.batch.batch_results.statuses_lookup.__name__,  # type: ignore
        _mock_statuses_lookup(tweets_truncated),
    )

    # Assert KeyError is propagated, because a Tweet is missing from
    # tweets_truncated.
    with pytest.raises(KeyError):
        idified.unidify(settings.twitter_api, unidify_dir)

    # The aborted run must have left fewer entries than the batch contains.
    partial = BatchResults(unidify_dir)
    assert len(batch) > len(partial)

    monkeypatch.setattr(
        nasty.batch.batch_results,
        nasty.batch.batch_results.statuses_lookup.__name__,  # type: ignore
        _mock_statuses_lookup(tweets),
    )

    unidified = idified.unidify(settings.twitter_api, unidify_dir)
    assert unidified is not None
    assert len(batch) == len(unidified)

    restored = {
        tweet.id: tweet for entry in unidified for tweet in unidified.tweets(entry)
    }
    assert tweets == restored
def test_execute_success_parallel(tmp_path: Path, monkeypatch: MonkeyPatch) -> None:
    """Check that a 16-entry batch executes with NASTY_NUM_WORKERS set to 4."""
    monkeypatch.setenv("NASTY_NUM_WORKERS", "4")

    batch = Batch()
    # One single-day search for each of the first 16 days of January 2019.
    for day in range(1, 17):
        request = Search(
            "trump",
            since=date(2019, 1, day),
            until=date(2019, 1, day + 1),
            max_tweets=50,
        )
        batch.append(request)

    assert batch.execute(tmp_path)
    _assert_results_dir_structure(tmp_path, list(batch))
def _make_batch_results(
    *, idify_dir: Optional[Path] = None, unidify_dir: Optional[Path] = None
) -> BatchResults:
    """Execute a single-thread batch and optionally idify / unidify its results.

    Args:
        idify_dir: If given, the results are idified into this directory.
        unidify_dir: If given, the (possibly idified) results are unidified
            into this directory.

    Returns:
        The results of the last step that was performed.
    """
    # NOTE(review): a second function with this same name (taking an extra
    # ``settings`` argument) follows this one; if both live in one module the
    # later definition shadows this one -- confirm which is intended.
    thread_batch = Batch()
    thread_batch.append(Thread("1115689254271819777"))

    current = thread_batch.execute()
    assert current is not None

    if idify_dir is not None:
        current = current.idify(idify_dir)
        assert current is not None

    if unidify_dir is not None:
        current = current.unidify(unidify_dir)
        assert current is not None

    return current
def _make_batch_results(
    settings: NastySettings,
    *,
    idify_dir: Optional[Path] = None,
    unidify_dir: Optional[Path] = None,
) -> BatchResults:
    """Execute a single-thread batch and optionally idify / unidify its results.

    Args:
        settings: Settings whose ``twitter_api`` is passed to ``unidify``.
        idify_dir: If given, the results are idified into this directory.
        unidify_dir: If given, the (possibly idified) results are unidified
            into this directory.

    Returns:
        The results of the last step that was performed.
    """
    thread_batch = Batch()
    thread_batch.append(Thread("1115689254271819777"))

    current = thread_batch.execute()
    assert current is not None

    if idify_dir is not None:
        current = current.idify(idify_dir)
        assert current is not None

    if unidify_dir is not None:
        current = current.unidify(settings.twitter_api, unidify_dir)
        assert current is not None

    return current
def test_execute_exception_internal_server_error(tmp_path: Path) -> None:
    """Check that an HTTP 500 from the search endpoint is saved as an exception."""
    # NOTE(review): relies on the ``responses`` mock being active for this
    # test; the activation is not visible in this block -- confirm.

    # Simulate 500 Internal Server Error on first request to Twitter.
    responses.add(
        responses.GET,
        "https://mobile.twitter.com/robots.txt",
        body="Crawl-delay: 1",
    )
    responses.add(
        responses.GET,
        "https://mobile.twitter.com/search",
        match_querystring=False,
        status=HTTPStatus.INTERNAL_SERVER_ERROR.value,
    )

    batch = Batch()
    batch.append(Search("trump", max_tweets=50))
    assert not batch.execute(tmp_path)

    # The failure must be persisted in the entry's meta file.
    batch_entry = batch[0]
    stored_entry = read_json(tmp_path / batch_entry.meta_file_name, BatchEntry)
    assert batch_entry == stored_entry
    assert batch_entry.exception is not None
    assert batch_entry.exception.type == "UnexpectedStatusCodeException"
def test_execute_retrying_after_exception(
    tmp_path: Path, caplog: LogCaptureFixture
) -> None:
    """Check that an entry whose meta file holds an exception is retried.

    A meta file containing a serialized exception is written up front. The
    following execution must succeed, log exactly one "Retrying" record that
    mentions the previous exception, and rewrite the meta file.
    """
    batch = Batch()
    batch.append(Search("trump", max_tweets=50))
    entry = batch[0]

    # Persist the entry together with a previous exception, then clear the
    # exception on the in-memory entry again.
    exception = _make_json_serialized_exception()
    entry.exception = exception
    meta_file = tmp_path / entry.meta_file_name
    write_json(meta_file, entry)
    entry.exception = None
    stat_before = meta_file.stat()

    caplog.clear()
    assert batch.execute(tmp_path)

    # Assert that log says we are retrying and the previous exception.
    retry_records = [
        log_record
        for log_record in caplog.records
        if "Retrying" in log_record.msg and str(exception) in log_record.msg
    ]
    assert 1 == len(retry_records)

    _assert_results_dir_structure(tmp_path, list(batch))

    # The meta file must have been written anew by the retry.
    stat_after = meta_file.stat()
    assert stat_before.st_mtime_ns < stat_after.st_mtime_ns
def run(self) -> None:
    """Load the batch from ``self.batch_file`` and execute it into ``self.results_dir``.

    The value returned by ``Batch.execute`` is discarded.
    """
    loaded_batch = Batch()
    loaded_batch.load(self.batch_file)
    loaded_batch.execute(self.results_dir)