def test_execute_success(tmp_path: Path) -> None:
    """Executing a batch of three searches succeeds and writes the expected files."""
    batch = Batch()
    for query in ("trump", "hillary", "obama"):
        batch.append(Search(query, max_tweets=50))
    assert batch.execute(tmp_path)
    _assert_results_dir_structure(tmp_path, list(batch))
def test_lang_de() -> None:
    """A German-language LATEST search still yields the full 50 Tweets.

    NOTE(review): another ``test_lang_de`` (without ``filter_``) appears in this
    source; if both end up in the same module, one definition shadows the other.
    """
    request = Search("trump", lang="de", max_tweets=50, filter_=SearchFilter.LATEST)
    tweets = list(request.request())
    assert len(tweets) == 50
def test_query_user_from(user: str) -> None:
    """Every result of a ``from:@user`` query must be authored by that user."""
    request = Search("from:@" + user, max_tweets=50, filter_=SearchFilter.LATEST)
    tweets = list(request.request())
    assert 0 < len(tweets) <= 50
    expected_screen_name = user.lower()
    for tweet in tweets:
        assert tweet.user.screen_name.lower() == expected_screen_name
def test_filter_latest() -> None:
    """The 50 latest Tweets about "trump" should all be from the last 24 hours.

    Assumes that each day there are at least 50 Tweets about "trump".
    """
    tweets = list(Search("trump", filter_=SearchFilter.LATEST, max_tweets=50).request())
    assert len(tweets) == 50
    cutoff = datetime.now(timezone.utc) - timedelta(days=1)
    for tweet in tweets:
        assert tweet.created_at > cutoff
def test_execute_skipping(tmp_path: Path, caplog: LogCaptureFixture) -> None:
    """Re-executing an already completed batch skips it without touching result files."""
    batch_file = tmp_path / "batch.jsonl"
    results_dir = tmp_path / "out"

    # First execution actually performs the request.
    batch = Batch()
    batch.append(Search("trump", max_tweets=50))
    batch.dump(batch_file)
    assert batch.execute(results_dir)
    _assert_results_dir_structure(results_dir, list(batch))

    entry = batch[0]
    meta_file = results_dir / entry.meta_file_name
    data_file = results_dir / entry.data_file_name
    meta_mtime_before = meta_file.stat().st_mtime_ns
    data_mtime_before = data_file.stat().st_mtime_ns

    # Recreate the batch from the dumped file so that batch entry IDs match,
    # then execute again: the completed entry must be skipped.
    batch = Batch()
    batch.load(batch_file)
    caplog.clear()
    assert batch.execute(results_dir)
    skip_records = [record for record in caplog.records if "Skipping" in record.msg]
    assert len(skip_records) == 1
    _assert_results_dir_structure(results_dir, list(batch))

    # The second run must not have modified either result file.
    assert meta_file.stat().st_mtime_ns == meta_mtime_before
    assert data_file.stat().st_mtime_ns == data_mtime_before
def test_execute_stray_data_file(tmp_path: Path, caplog: LogCaptureFixture) -> None:
    """A stray data file (no meta file) fails the entry; deleting it allows a retry.

    Fix: compare nanosecond timestamps (``st_mtime_ns``) instead of the float
    ``st_mtime``, matching the other tests in this suite and avoiding float
    precision loss on filesystems with sub-second timestamps.
    """
    batch = Batch()
    batch.append(Search("trump", max_tweets=50))
    batch_entry = batch[0]

    data = "Just some stray data."
    data_file = tmp_path / batch_entry.data_file_name
    write_file(data_file, data)
    data_stat1 = data_file.stat()

    assert not batch.execute(tmp_path)

    # Assert exception is saved.
    assert batch_entry == read_json(tmp_path / batch_entry.meta_file_name, BatchEntry)
    assert batch_entry.exception is not None
    assert batch_entry.exception.type == "ValueError"
    batch_entry.exception = None

    # Assert that the previous data file was not modified.
    data_stat2 = data_file.stat()
    assert data_stat1.st_mtime_ns == data_stat2.st_mtime_ns
    assert data == read_file(data_file)

    # Delete the data file and verify that execution works now.
    data_file.unlink()
    caplog.clear()
    assert batch.execute(tmp_path)
    assert 1 == len([record for record in caplog.records if "Retrying" in record.msg])
    _assert_results_dir_structure(tmp_path, list(batch))
def test_query_word_or(args: Tuple[str, str]) -> None:
    """Each result of an "x or y" query contains at least one of the two words."""
    word1, word2 = args
    query = "{} or {}".format(word1, word2)
    tweets = list(Search(query, max_tweets=50).request())
    assert len(tweets) == 50
    for tweet in tweets:
        serialized = json.dumps(tweet.to_json()).lower()
        assert (word1.lower() in serialized) or (word2.lower() in serialized)
def test_max_tweets(max_tweets: int) -> None:
    """Exactly ``max_tweets`` distinct Tweets are returned."""
    # batch_size=100 speeds up the larger requests; query accuracy is
    # irrelevant for this test.
    tweets = list(Search("trump", max_tweets=max_tweets, batch_size=100).request())
    assert len(tweets) == max_tweets
    unique_ids = {tweet.id for tweet in tweets}
    assert len(unique_ids) == len(tweets)
def test_date_range(args: Tuple[date, date]) -> None:
    """Every returned Tweet lies within the half-open ``[since, until)`` range."""
    since, until = args
    tweets = list(Search("trump", since=since, until=until, max_tweets=40).request())
    assert len(tweets) == 40
    for tweet in tweets:
        created = tweet.created_at.date()
        assert since <= created < until
def test_json_conversion_exception() -> None:
    """A BatchEntry carrying a serialized exception round-trips through JSON."""
    entry = BatchEntry(
        Search("q"),
        id_="id",
        completed_at=None,
        exception=_make_json_serialized_exception(),
    )
    assert entry.from_json(entry.to_json()) == entry
def test_filter_photos() -> None:
    """Every result of a PHOTOS-filtered search carries photo or GIF media.

    Fix: the local variable was named ``json``, shadowing the stdlib ``json``
    module that other tests in this file use; renamed to ``tweet_json``.
    """
    tweets = list(Search("trump", filter_=SearchFilter.PHOTOS, max_tweets=50).request())
    assert len(tweets) == 50
    for tweet in tweets:
        tweet_json = cast(Any, tweet.json)
        media = tweet_json["extended_entities"]["media"]
        assert len(media)
        for medium in media:
            assert medium["type"] in {"photo", "animated_gif"}
def test_query_user_to(user: str) -> None:
    """Results of a ``to:@user`` query mention the user or are authored by them."""
    tweets = list(Search("to:@" + user, max_tweets=50).request())
    assert len(tweets) == 50
    mention = "@" + user.lower()
    for tweet in tweets:
        if mention not in tweet.text.lower():
            # Sometimes when a user creates a thread his individual Tweets will
            # not reply to the user, for example:
            # https://twitter.com/_/status/1197499643086753793
            assert tweet.user.screen_name.lower() == user.lower()
def test_execute_success_empty(tmp_path: Path) -> None:
    """Executing a search that matches nothing succeeds with an empty result set."""
    # Random string that currently does not match any Tweet.
    no_match_query = "c9dde8b5451149e683d4f07e4c4348ef"
    batch = Batch()
    batch.append(Search(no_match_query))
    results = batch.execute(tmp_path)
    assert results
    assert not list(results.tweets(results[0]))
    _assert_results_dir_structure(tmp_path, list(batch), allow_empty=True)
def _build_request(self) -> Search:
    """Construct a :class:`Search` request from this object's stored parameters.

    All attributes are forwarded unchanged; presumably they were captured from
    command-line arguments or a prior request (enclosing class not visible here).
    """
    return Search(
        self.query,
        since=self.since,
        until=self.until,
        filter_=self.filter_,
        lang=self.lang,
        max_tweets=self.max_tweets,
        batch_size=self.batch_size,
    )
def test_query_word_phrase(phrase: str) -> None:
    """Each result of a phrase query contains the phrase, punctuation-insensitively.

    Fix: the original reassigned the ``phrase`` parameter inside the loop
    (``phrase = re.sub(...)``), so from the second matching Tweet onward the raw
    containment guard compared an already-normalized phrase against
    un-normalized text and silently skipped the assertion. Normalization is now
    hoisted into a loop-invariant local.
    """
    tweets = list(Search(phrase, max_tweets=50).request())
    assert len(tweets) == 50
    # Remove non alphanumeric, see https://stackoverflow.com/a/1277047/211404
    normalized_phrase = re.sub(r"[\W_]+", "", phrase.lower())
    for tweet in tweets:
        all_tweet_text = json.dumps(tweet.to_json()).lower()
        if phrase.lower() in all_tweet_text:
            normalized_text = re.sub(r"[\W_]+", "", all_tweet_text)
            assert normalized_phrase in normalized_text
def test_query_word_and(args: Tuple[str, str]) -> None:
    """Each result of an "x and y" query contains both words."""
    word1, word2 = args
    query = "{} and {}".format(word1, word2)
    tweets = list(Search(query, filter_=SearchFilter.LATEST, max_tweets=50).request())
    assert len(tweets) == 50
    for tweet in tweets:
        serialized = json.dumps(tweet.to_json()).lower()
        assert word1.lower() in serialized
        assert word2.lower() in serialized
def test_execute_success_parallel(tmp_path: Path, monkeypatch: MonkeyPatch) -> None:
    """Sixteen one-day searches execute successfully with four parallel workers."""
    monkeypatch.setenv("NASTY_NUM_WORKERS", "4")
    batch = Batch()
    for day in range(1, 17):
        batch.append(
            Search(
                "trump",
                since=date(2019, 1, day),
                until=date(2019, 1, day + 1),
                max_tweets=50,
            )
        )
    assert batch.execute(tmp_path)
    _assert_results_dir_structure(tmp_path, list(batch))
def test_query_word_not(args: Tuple[str, str]) -> None:
    """Each result of an "x -y" query contains word1; skip if word2 leaks through."""
    word1, word2 = args
    tweets = list(Search("{} -{}".format(word1, word2), max_tweets=50).request())
    assert len(tweets) == 50
    for tweet in tweets:
        serialized = json.dumps(tweet.to_json()).lower()
        assert word1.lower() in serialized
        # Sadly, word2 can sometimes still occur in the text even though we
        # specifically ask Twitter not to include it. Treat that as a skipped
        # test rather than a failure.
        if word2.lower() in tweet.text.lower():
            pytest.skip(
                "Negative query word '{}' found in result tweet: {}".format(
                    word2, tweet.to_json()))
def test_dump_load_multiple(num_batch_entries: int, tmp_path: Path) -> None:
    """Dumping a batch writes one non-empty JSONL line per entry; loading restores it."""
    batch_file = tmp_path / "batch.jsonl"
    batch = Batch()
    for i in range(1, num_batch_entries + 1):
        batch.append(Search(str(i), max_tweets=i, batch_size=i))
    batch.dump(batch_file)

    lines = list(read_lines_file(batch_file))
    assert len(lines) == num_batch_entries
    for line in lines:
        assert len(line) != 0

    restored = Batch()
    restored.load(batch_file)
    assert list(restored) == list(batch)
def test_filter_videos() -> None:
    """Every result of a VIDEOS-filtered search hosts a video (Twitter or external).

    Fixes:
    - The false-positive guard used ``return``, which aborted the whole test at
      the first Tweet containing "video" in its text, leaving all remaining
      Tweets unchecked; it now ``continue``s to the next Tweet instead.
    - The local variable ``json`` shadowed the stdlib ``json`` module; renamed
      to ``tweet_json``.
    """
    tweets = list(Search("trump", filter_=SearchFilter.VIDEOS, max_tweets=50).request())
    assert len(tweets) == 50
    for tweet in tweets:
        if "extended_entities" not in tweet.json:
            # Video hosted on external platform. AFAIK there is no general way
            # to check whether an URL to an external platform contains a video.
            continue

        # Video hosted on Twitter.
        tweet_json = cast(Any, tweet.json)
        assert len(tweet_json["extended_entities"]["media"])
        if "video" in tweet.text.lower():
            # Had one case, where an image post containing the substring
            # "VIDEO: youtu.be/..." matched this query. Skip only this Tweet.
            continue
        for medium in tweet_json["extended_entities"]["media"]:
            assert medium["type"] == "video"
def test_correct_call_to_batch_daily(capsys: CaptureFixture, tmp_path: Path) -> None:
    """``--to-batch`` with daily splitting writes one pristine batch entry per day."""
    batch_file = tmp_path / "batch.jsonl"
    request = Search("trump", since=date(2019, 1, 1), until=date(2019, 2, 1))
    # Needed for type checking.
    assert request.until is not None and request.since is not None

    main(*_make_args(request, to_batch=batch_file, daily=True))
    assert capsys.readouterr().out == ""

    batch = Batch()
    batch.load(batch_file)
    num_days = (request.until - request.since).days
    assert len(batch) == num_days
    for entry, expected_request in zip(batch, request.to_daily_requests()):
        assert entry.request == expected_request
        assert entry.id
        assert entry.completed_at is None
        assert entry.exception is None
def test_execute_exception_internal_server_error(tmp_path: Path) -> None:
    """A 500 from Twitter fails the entry and records the exception in the meta file."""
    # Simulate 500 Internal Server Error on first request to Twitter.
    responses.add(
        responses.GET,
        "https://mobile.twitter.com/robots.txt",
        body="Crawl-delay: 1",
    )
    responses.add(
        responses.GET,
        "https://mobile.twitter.com/search",
        match_querystring=False,
        status=HTTPStatus.INTERNAL_SERVER_ERROR.value,
    )

    batch = Batch()
    batch.append(Search("trump", max_tweets=50))
    assert not batch.execute(tmp_path)

    entry = batch[0]
    assert read_json(tmp_path / entry.meta_file_name, BatchEntry) == entry
    assert entry.exception is not None
    assert entry.exception.type == "UnexpectedStatusCodeException"
def test_execute_retrying_after_exception(
    tmp_path: Path, caplog: LogCaptureFixture
) -> None:
    """An entry whose meta file records a prior exception is retried and succeeds."""
    batch = Batch()
    batch.append(Search("trump", max_tweets=50))
    entry = batch[0]

    # Write a meta file marking the entry as previously failed.
    exception = _make_json_serialized_exception()
    entry.exception = exception
    meta_file = tmp_path / entry.meta_file_name
    write_json(meta_file, entry)
    entry.exception = None
    mtime_before = meta_file.stat().st_mtime_ns

    caplog.clear()
    assert batch.execute(tmp_path)
    # Assert that the log says we are retrying and mentions the prior exception.
    retry_records = [
        record
        for record in caplog.records
        if "Retrying" in record.msg and str(exception) in record.msg
    ]
    assert len(retry_records) == 1
    _assert_results_dir_structure(tmp_path, list(batch))
    # The meta file must have been rewritten by the successful retry.
    assert meta_file.stat().st_mtime_ns > mtime_before
from typing_extensions import Final from nasty import main from nasty.batch.batch import Batch from nasty.request.replies import Replies from nasty.request.request import DEFAULT_BATCH_SIZE, DEFAULT_MAX_TWEETS, Request from nasty.request.search import DEFAULT_FILTER, DEFAULT_LANG, Search, SearchFilter from nasty.request.thread import Thread from .mock_context import MockRequestContext logger = getLogger(__name__) REQUESTS: Final[Mapping[Type[Request], Sequence[Request]]] = { Search: [ Search("trump"), Search("donald trump"), Search("trump", since=date(2019, 3, 21), until=date(2019, 3, 22)), Search("trump", filter_=SearchFilter.LATEST), Search("trump", lang="de"), Search("trump", max_tweets=17, batch_size=71), Search("trump", max_tweets=None, batch_size=DEFAULT_BATCH_SIZE), ], Replies: [ Replies("332308211321425920"), Replies("332308211321425920", max_tweets=17, batch_size=71), Replies("332308211321425920", max_tweets=None, batch_size=DEFAULT_BATCH_SIZE), ], Thread: [
def test_query_word_unkown(word: str) -> None:
    """A nonsense word yields no results.

    NOTE(review): "unkown" in the test name is a typo for "unknown"; left as-is
    to avoid changing the public identifier.
    """
    tweets = list(Search(word).request())
    assert not tweets
def test_query_word_single(word: str) -> None:
    """Every result of a single-word query contains that word in its JSON."""
    tweets = list(Search(word, max_tweets=50).request())
    assert len(tweets) == 50
    needle = word.lower()
    for tweet in tweets:
        assert needle in json.dumps(tweet.to_json()).lower()
def test_special_msg_coronavirus() -> None:
    """Searching "coronavirus" still yields the full 50 Tweets."""
    request = Search("coronavirus", max_tweets=50, filter_=SearchFilter.LATEST)
    tweets = list(request.request())
    assert len(tweets) == 50
def test_lang_invalid() -> None:
    """An invalid language code yields no results."""
    tweets = list(Search("trump", lang="INVALID", max_tweets=50).request())
    assert not tweets
def test_lang_de() -> None:
    """A German-language search yields the full 50 Tweets.

    NOTE(review): another ``test_lang_de`` (with ``filter_=SearchFilter.LATEST``)
    appears in this source; if both end up in the same module, one definition
    shadows the other and only one is collected by pytest.
    """
    tweets = list(Search("trump", lang="de", max_tweets=50).request())
    assert len(tweets) == 50
def test_filter_top() -> None:
    """A TOP-filtered search yields the full 50 Tweets."""
    tweets = list(Search("trump", filter_=SearchFilter.TOP, max_tweets=50).request())
    assert len(tweets) == 50