Esempio n. 1
0
def test_execute_success(tmp_path: Path) -> None:
    """Execute a batch of three searches and check the results directory."""
    batch = Batch()
    for query in ("trump", "hillary", "obama"):
        batch.append(Search(query, max_tweets=50))

    succeeded = batch.execute(tmp_path)
    assert succeeded
    _assert_results_dir_structure(tmp_path, list(batch))
Esempio n. 2
0
def test_lang_de() -> None:
    """Request the 50 latest "trump" Tweets with ``lang="de"``; expect all 50."""
    request = Search("trump",
                     lang="de",
                     max_tweets=50,
                     filter_=SearchFilter.LATEST)
    tweets = list(request.request())
    assert 50 == len(tweets)
Esempio n. 3
0
def test_query_user_from(user: str) -> None:
    """Check that all Tweets found for ``from:@<user>`` were posted by ``user``."""
    request = Search("from:@" + user,
                     max_tweets=50,
                     filter_=SearchFilter.LATEST)
    tweets = list(request.request())
    assert 0 < len(tweets) <= 50

    # Screen names are compared case-insensitively.
    expected_name = user.lower()
    for tweet in tweets:
        assert expected_name == tweet.user.screen_name.lower()
Esempio n. 4
0
def test_filter_latest() -> None:
    """Check that the 50 latest Tweets about "trump" are less than a day old.

    Assumes that each day there are at least 50 Tweets about "trump".
    """
    request = Search("trump", filter_=SearchFilter.LATEST, max_tweets=50)
    tweets = list(request.request())
    assert 50 == len(tweets)

    max_age = timedelta(days=1)
    for tweet in tweets:
        # The current time is taken anew for every Tweet.
        oldest_allowed = datetime.now(timezone.utc) - max_age
        assert oldest_allowed < tweet.created_at
Esempio n. 5
0
def test_execute_skipping(tmp_path: Path, caplog: LogCaptureFixture) -> None:
    """Execute the same dumped batch twice; the second run must skip the entry.

    The second run is expected to log exactly one record containing "Skipping"
    and to leave the meta and data files written by the first run unmodified.
    """
    results_dir = tmp_path / "out"
    batch_file = tmp_path / "batch.jsonl"

    # First run: dump the batch, then execute it.
    first_batch = Batch()
    first_batch.append(Search("trump", max_tweets=50))
    first_batch.dump(batch_file)
    assert first_batch.execute(results_dir)
    _assert_results_dir_structure(results_dir, list(first_batch))

    entry = first_batch[0]
    meta_file = results_dir / entry.meta_file_name
    data_file = results_dir / entry.data_file_name
    mtimes_before = (meta_file.stat().st_mtime_ns,
                     data_file.stat().st_mtime_ns)

    # Second run: recreate from the dumped batch file so that batch entry IDs
    # match, then execute again (should be skipped).
    second_batch = Batch()
    second_batch.load(batch_file)
    caplog.clear()
    assert second_batch.execute(results_dir)
    skip_records = [
        record for record in caplog.records if "Skipping" in record.msg
    ]
    assert 1 == len(skip_records)
    _assert_results_dir_structure(results_dir, list(second_batch))

    # Verify that files were not modified.
    mtimes_after = (meta_file.stat().st_mtime_ns,
                    data_file.stat().st_mtime_ns)
    assert mtimes_before == mtimes_after
Esempio n. 6
0
def test_execute_stray_data_file(tmp_path: Path,
                                 caplog: LogCaptureFixture) -> None:
    """Check that a pre-existing data file makes execution fail without touching it.

    A data file is written for the batch entry before the batch is executed.
    Executing the batch must then fail, store a ``ValueError`` in the entry's
    meta file, and leave the stray file unmodified. Once the stray file is
    deleted, a second execution must succeed and log exactly one record
    containing "Retrying".
    """
    batch = Batch()
    batch.append(Search("trump", max_tweets=50))

    batch_entry = batch[0]
    data = "Just some stray data."
    data_file = tmp_path / batch_entry.data_file_name
    write_file(data_file, data)
    data_stat1 = data_file.stat()

    assert not batch.execute(tmp_path)

    # Assert exception is saved.
    assert batch_entry == read_json(tmp_path / batch_entry.meta_file_name,
                                    BatchEntry)
    assert batch_entry.exception is not None
    assert batch_entry.exception.type == "ValueError"
    # Clear the recorded exception on the in-memory entry before re-executing.
    batch_entry.exception = None

    # Assert that previous data file is not modified. Compare the integer
    # nanosecond timestamp (as the other mtime checks in this test suite do)
    # instead of the float ``st_mtime``, which cannot represent it exactly.
    data_stat2 = data_file.stat()
    assert data_stat1.st_mtime_ns == data_stat2.st_mtime_ns
    assert data == read_file(data_file)

    # Delete data file and verify that it works now.
    data_file.unlink()
    caplog.clear()
    assert batch.execute(tmp_path)
    retry_records = [
        record for record in caplog.records if "Retrying" in record.msg
    ]
    assert 1 == len(retry_records)
    _assert_results_dir_structure(tmp_path, list(batch))
Esempio n. 7
0
def test_query_word_or(args: Tuple[str, str]) -> None:
    """Check that each Tweet found for "<word1> or <word2>" contains one of them.

    The words are looked up case-insensitively in the Tweet's serialized JSON.
    """
    word1, word2 = args
    query = "{} or {}".format(word1, word2)
    tweets = list(Search(query, max_tweets=50).request())
    assert 50 == len(tweets)

    needles = (word1.lower(), word2.lower())
    for tweet in tweets:
        haystack = json.dumps(tweet.to_json()).lower()
        assert any(needle in haystack for needle in needles)
Esempio n. 8
0
def test_max_tweets(max_tweets: int) -> None:
    """Check that exactly ``max_tweets`` Tweets with distinct IDs are returned."""
    # Using batch_size=100 to speed up these larger requests and since we don't
    # care about accuracy to query here.
    request = Search("trump", max_tweets=max_tweets, batch_size=100)
    tweets = list(request.request())
    assert max_tweets == len(tweets)

    # No Tweet ID may occur twice.
    unique_ids = {tweet.id for tweet in tweets}
    assert len(tweets) == len(unique_ids)
Esempio n. 9
0
def test_date_range(args: Tuple[date, date]) -> None:
    """Check that all returned Tweets were created within ``[since, until)``."""
    since, until = args
    request = Search("trump", since=since, until=until, max_tweets=40)
    tweets = list(request.request())
    assert 40 == len(tweets)

    for tweet in tweets:
        created_on = tweet.created_at.date()
        assert since <= created_on < until
Esempio n. 10
0
def test_json_conversion_exception() -> None:
    """Round-trip a ``BatchEntry`` that carries an exception through JSON."""
    original = BatchEntry(
        Search("q"),
        id_="id",
        completed_at=None,
        exception=_make_json_serialized_exception(),
    )
    serialized = original.to_json()
    restored = original.from_json(serialized)
    assert original == restored
Esempio n. 11
0
def test_filter_photos() -> None:
    """Check that every Tweet from the PHOTOS filter has photo or GIF media."""
    request = Search("trump", filter_=SearchFilter.PHOTOS, max_tweets=50)
    tweets = list(request.request())
    assert 50 == len(tweets)

    allowed_types = {"photo", "animated_gif"}
    for tweet in tweets:
        # cast() has no runtime effect; presumably it is only here so that the
        # nested lookups pass type checking.
        tweet_json = cast(Any, tweet.json)
        media = tweet_json["extended_entities"]["media"]
        assert len(media)
        for medium in media:
            assert medium["type"] in allowed_types
Esempio n. 12
0
def test_query_user_to(user: str) -> None:
    """Check that Tweets found for ``to:@<user>`` mention or were posted by ``user``."""
    tweets = list(Search("to:@" + user, max_tweets=50).request())
    assert 50 == len(tweets)

    mention = "@" + user.lower()
    for tweet in tweets:
        if mention in tweet.text.lower():
            continue
        # Sometimes when a user creates a thread his individual Tweets will not
        # reply to the user, for example:
        # https://twitter.com/_/status/1197499643086753793
        assert user.lower() == tweet.user.screen_name.lower()
Esempio n. 13
0
def test_execute_success_empty(tmp_path: Path) -> None:
    """Execute a search that matches nothing; it must succeed with no Tweets."""
    # Random string that currently does not match any Tweet.
    unknown_word = "c9dde8b5451149e683d4f07e4c4348ef"
    batch = Batch()
    batch.append(Search(unknown_word))

    results = batch.execute(tmp_path)
    assert results
    tweets = list(results.tweets(results[0]))
    assert not tweets
    _assert_results_dir_structure(tmp_path, list(batch), allow_empty=True)
Esempio n. 14
0
 def _build_request(self) -> Search:
     """Create a ``Search`` request from the attributes stored on this object."""
     query = self.query
     options = {
         "since": self.since,
         "until": self.until,
         "filter_": self.filter_,
         "lang": self.lang,
         "max_tweets": self.max_tweets,
         "batch_size": self.batch_size,
     }
     return Search(query, **options)
Esempio n. 15
0
def test_query_word_phrase(phrase: str) -> None:
    """Check that every Tweet returned for ``phrase`` actually contains it.

    A Tweet passes if the lower-cased phrase occurs verbatim in its serialized
    JSON or, failing that, if it occurs once all non-alphanumeric characters
    have been removed from both the phrase and the JSON text.
    """
    tweets = list(Search(phrase, max_tweets=50).request())
    assert 50 == len(tweets)

    # Derive both forms of the phrase once, into separate names. Overwriting
    # ``phrase`` inside the loop would make later Tweets be checked against the
    # already stripped phrase.
    needle = phrase.lower()
    # Remove non alphanumeric, see https://stackoverflow.com/a/1277047/211404
    non_alphanumeric = re.compile(r"[\W_]+")
    stripped_needle = non_alphanumeric.sub("", needle)

    for tweet in tweets:
        all_tweet_text = json.dumps(tweet.to_json()).lower()
        if needle in all_tweet_text:
            continue
        # Only fall back to the normalized comparison when the verbatim one
        # fails. Asserting it after a verbatim hit could never fail.
        assert stripped_needle in non_alphanumeric.sub("", all_tweet_text)
Esempio n. 16
0
def test_query_word_and(args: Tuple[str, str]) -> None:
    """Check that each Tweet found for "<word1> and <word2>" contains both words.

    The words are looked up case-insensitively in the Tweet's serialized JSON.
    """
    word1, word2 = args
    query = "{} and {}".format(word1, word2)
    request = Search(query, filter_=SearchFilter.LATEST, max_tweets=50)
    tweets = list(request.request())
    assert 50 == len(tweets)

    for tweet in tweets:
        haystack = json.dumps(tweet.to_json()).lower()
        for word in (word1, word2):
            assert word.lower() in haystack
Esempio n. 17
0
def test_execute_success_parallel(tmp_path: Path,
                                  monkeypatch: MonkeyPatch) -> None:
    """Execute 16 single-day searches with ``NASTY_NUM_WORKERS`` set to 4."""
    monkeypatch.setenv("NASTY_NUM_WORKERS", "4")

    batch = Batch()
    # One request per day, covering 1 to 16 January 2019.
    for day in range(1, 17):
        request = Search(
            "trump",
            since=date(2019, 1, day),
            until=date(2019, 1, day + 1),
            max_tweets=50,
        )
        batch.append(request)

    succeeded = batch.execute(tmp_path)
    assert succeeded
    _assert_results_dir_structure(tmp_path, list(batch))
Esempio n. 18
0
def test_query_word_not(args: Tuple[str, str]) -> None:
    """Check that each Tweet found for "<word1> -<word2>" contains ``word1``.

    The test is skipped, not failed, as soon as a Tweet's text contains
    ``word2``.
    """
    word1, word2 = args
    query = "{} -{}".format(word1, word2)
    tweets = list(Search(query, max_tweets=50).request())
    assert 50 == len(tweets)

    wanted, excluded = word1.lower(), word2.lower()
    for tweet in tweets:
        all_tweet_text = json.dumps(tweet.to_json()).lower()
        assert wanted in all_tweet_text
        if excluded not in tweet.text.lower():
            continue
        # Sadly, word2 can sometimes still occur in the Text even though we
        # specifically ask Twitter not to. In those cases I do not want to count
        # this case a failure and skip it then.
        pytest.skip(
            "Negative query word '{}' found in result tweet: {}".format(
                word2, tweet.to_json()))
Esempio n. 19
0
def test_dump_load_multiple(num_batch_entries: int, tmp_path: Path) -> None:
    """Dump a batch of ``num_batch_entries`` searches and load it back unchanged."""
    batch_file = tmp_path / "batch.jsonl"

    original = Batch()
    for n in range(1, num_batch_entries + 1):
        original.append(Search(str(n), max_tweets=n, batch_size=n))
    original.dump(batch_file)

    # The dump must contain one non-empty line per batch entry.
    lines = list(read_lines_file(batch_file))
    assert num_batch_entries == len(lines)
    assert all(0 != len(line) for line in lines)

    reloaded = Batch()
    reloaded.load(batch_file)
    assert list(original) == list(reloaded)
Esempio n. 20
0
def test_filter_videos() -> None:
    """Check that Tweets from the VIDEOS filter carry video media where checkable.

    Only Tweets with an ``"extended_entities"`` key can be verified. Tweets
    without it are accepted as-is, and so are Tweets whose text contains
    "video".
    """
    request = Search("trump", filter_=SearchFilter.VIDEOS, max_tweets=50)
    tweets = list(request.request())
    assert 50 == len(tweets)

    for tweet in tweets:
        if "extended_entities" not in tweet.json:
            # Video hosted on external platform. AFAIK there is no general way
            # to check whether an URL to an external platform contains a video.
            continue

        # Video hosted on Twitter.
        tweet_json = cast(Any, tweet.json)
        media = tweet_json["extended_entities"]["media"]
        assert len(media)
        if "video" in tweet.text.lower():
            # Had one case, where an image post containing the substring
            # "VIDEO: youtu.be/..." matched this query. Skip only this Tweet:
            # returning here would leave all remaining Tweets unchecked.
            continue
        for medium in media:
            assert "video" == medium["type"]
Esempio n. 21
0
def test_correct_call_to_batch_daily(capsys: CaptureFixture,
                                     tmp_path: Path) -> None:
    """Run ``main`` with ``to_batch`` and ``daily`` set and inspect the batch file.

    Nothing may be written to stdout, and the batch file must hold one fresh
    entry (ID set, not completed, no exception) per daily request.
    """
    request = Search("trump", since=date(2019, 1, 1), until=date(2019, 2, 1))
    batch_file = tmp_path / "batch.jsonl"

    # Needed for type checking.
    assert request.until is not None and request.since is not None

    cli_args = _make_args(request, to_batch=batch_file, daily=True)
    main(*cli_args)

    captured = capsys.readouterr()
    assert captured.out == ""

    batch = Batch()
    batch.load(batch_file)
    num_days = (request.until - request.since).days
    assert len(batch) == num_days

    daily_requests = request.to_daily_requests()
    for batch_entry, expected_request in zip(batch, daily_requests):
        assert batch_entry.request == expected_request
        assert batch_entry.id
        assert batch_entry.completed_at is None
        assert batch_entry.exception is None
Esempio n. 22
0
def test_execute_exception_internal_server_error(tmp_path: Path) -> None:
    """Check that a 500 response fails the batch and the exception is saved."""
    robots_url = "https://mobile.twitter.com/robots.txt"
    search_url = "https://mobile.twitter.com/search"

    # Simulate 500 Internal Server Error on first request to Twitter.
    responses.add(responses.GET, robots_url, body="Crawl-delay: 1")
    responses.add(
        responses.GET,
        search_url,
        match_querystring=False,
        status=HTTPStatus.INTERNAL_SERVER_ERROR.value,
    )

    batch = Batch()
    batch.append(Search("trump", max_tweets=50))
    succeeded = batch.execute(tmp_path)
    assert not succeeded

    # The in-memory entry must equal the one stored in the meta file.
    batch_entry = batch[0]
    saved_entry = read_json(tmp_path / batch_entry.meta_file_name, BatchEntry)
    assert batch_entry == saved_entry

    exception = batch_entry.exception
    assert exception is not None
    assert exception.type == "UnexpectedStatusCodeException"
Esempio n. 23
0
def test_execute_retrying_after_exception(tmp_path: Path,
                                          caplog: LogCaptureFixture) -> None:
    """Check that an entry whose meta file holds an exception is executed again.

    The run must log exactly one record containing both "Retrying" and the
    previous exception, and must rewrite the meta file.
    """
    batch = Batch()
    batch.append(Search("trump", max_tweets=50))
    batch_entry = batch[0]
    meta_file = tmp_path / batch_entry.meta_file_name

    # Write a meta file that contains an exception, then clear the exception on
    # the in-memory entry again.
    exception = _make_json_serialized_exception()
    batch_entry.exception = exception
    write_json(meta_file, batch_entry)
    batch_entry.exception = None
    mtime_before = meta_file.stat().st_mtime_ns

    caplog.clear()
    assert batch.execute(tmp_path)

    # Assert that log says we are retrying and the previous exception.
    previous_error = str(exception)
    retry_records = [
        record for record in caplog.records
        if "Retrying" in record.msg and previous_error in record.msg
    ]
    assert 1 == len(retry_records)

    _assert_results_dir_structure(tmp_path, list(batch))
    mtime_after = meta_file.stat().st_mtime_ns
    assert mtime_before < mtime_after
Esempio n. 24
0
from typing_extensions import Final

from nasty import main
from nasty.batch.batch import Batch
from nasty.request.replies import Replies
from nasty.request.request import DEFAULT_BATCH_SIZE, DEFAULT_MAX_TWEETS, Request
from nasty.request.search import DEFAULT_FILTER, DEFAULT_LANG, Search, SearchFilter
from nasty.request.thread import Thread

from .mock_context import MockRequestContext

# Module-level logger, named after this module via ``__name__``.
logger = getLogger(__name__)

REQUESTS: Final[Mapping[Type[Request], Sequence[Request]]] = {
    Search: [
        Search("trump"),
        Search("donald trump"),
        Search("trump", since=date(2019, 3, 21), until=date(2019, 3, 22)),
        Search("trump", filter_=SearchFilter.LATEST),
        Search("trump", lang="de"),
        Search("trump", max_tweets=17, batch_size=71),
        Search("trump", max_tweets=None, batch_size=DEFAULT_BATCH_SIZE),
    ],
    Replies: [
        Replies("332308211321425920"),
        Replies("332308211321425920", max_tweets=17, batch_size=71),
        Replies("332308211321425920",
                max_tweets=None,
                batch_size=DEFAULT_BATCH_SIZE),
    ],
    Thread: [
Esempio n. 25
0
def test_query_word_unkown(word: str) -> None:
    """Check that searching for ``word`` yields no Tweets at all."""
    tweets = list(Search(word).request())
    assert not tweets
Esempio n. 26
0
def test_query_word_single(word: str) -> None:
    """Check that each of 50 Tweets found for ``word`` contains it.

    The word is looked up case-insensitively in the Tweet's serialized JSON.
    """
    request = Search(word, max_tweets=50)
    tweets = list(request.request())
    assert 50 == len(tweets)

    needle = word.lower()
    for tweet in tweets:
        serialized = json.dumps(tweet.to_json())
        assert needle in serialized.lower()
Esempio n. 27
0
def test_special_msg_coronavirus() -> None:
    """Check that a LATEST search for "coronavirus" returns 50 Tweets.

    NOTE(review): judging by the name, this presumably guards against a special
    message Twitter shows for this query; confirm against the request code.
    """
    request = Search("coronavirus", max_tweets=50, filter_=SearchFilter.LATEST)
    tweets = list(request.request())
    assert 50 == len(tweets)
Esempio n. 28
0
def test_lang_invalid() -> None:
    """Check that a search with ``lang="INVALID"`` yields no Tweets."""
    request = Search("trump", lang="INVALID", max_tweets=50)
    tweets = list(request.request())
    assert not tweets
Esempio n. 29
0
def test_lang_de() -> None:
    """Request 50 "trump" Tweets with ``lang="de"`` and expect all 50."""
    request = Search("trump", lang="de", max_tweets=50)
    tweets = list(request.request())
    assert 50 == len(tweets)
Esempio n. 30
0
def test_filter_top() -> None:
    """Request 50 "trump" Tweets with the TOP filter and expect all 50."""
    request = Search("trump", filter_=SearchFilter.TOP, max_tweets=50)
    tweets = list(request.request())
    assert 50 == len(tweets)