Ejemplo n.º 1
0
def test_execute_success_empty(tmp_path: Path) -> None:
    """A search for a nonsense token succeeds but yields zero Tweets."""
    # Random hex string that (currently) matches no Tweet.
    nonsense_query = "c9dde8b5451149e683d4f07e4c4348ef"

    batch = Batch()
    batch.append(Search(nonsense_query))
    batch_results = batch.execute(tmp_path)

    assert batch_results
    first_entry = batch_results[0]
    assert list(batch_results.tweets(first_entry)) == []
    _assert_results_dir_structure(tmp_path, list(batch), allow_empty=True)
Ejemplo n.º 2
0
def test_execute_success(tmp_path: Path) -> None:
    """Executing a batch of three small searches completes successfully."""
    batch = Batch()
    for query in ("trump", "hillary", "obama"):
        batch.append(Search(query, max_tweets=50))
    assert batch.execute(tmp_path)
    _assert_results_dir_structure(tmp_path, list(batch))
Ejemplo n.º 3
0
def test_unidify_fail_and_restart(
    requests: Iterable[Request],
    settings: NastySettings,
    monkeypatch: MonkeyPatch,
    tmp_path: Path,
) -> None:
    """Unidify raises KeyError on a missing Tweet, then succeeds on restart."""
    idify_dir = tmp_path / "idify"
    unidify_dir = tmp_path / "unidify"

    def collect_tweets(batch_results):
        # Map Tweet-ID -> Tweet across all entries of the given results.
        return {
            tweet.id: tweet
            for entry in batch_results
            for tweet in batch_results.tweets(entry)
        }

    def patch_statuses_lookup(tweets_by_id):
        # Swap the Twitter API lookup for a mock that serves tweets_by_id.
        monkeypatch.setattr(
            nasty.batch.batch_results,
            nasty.batch.batch_results.statuses_lookup.__name__,  # type: ignore
            _mock_statuses_lookup(tweets_by_id),
        )

    batch = Batch()
    for request in requests:
        batch.append(request)
    results = batch.execute()
    assert results is not None

    tweets = collect_tweets(results)
    tweets_truncated = dict(tweets)
    del tweets_truncated[TweetId("1115690615612825601")]

    idified = results.idify(idify_dir)
    assert idified is not None

    patch_statuses_lookup(tweets_truncated)

    # KeyError must propagate, because a Tweet is absent from tweets_truncated.
    with pytest.raises(KeyError):
        idified.unidify(settings.twitter_api, unidify_dir)
    partial = BatchResults(unidify_dir)
    assert len(batch) > len(partial)

    patch_statuses_lookup(tweets)

    unidified = idified.unidify(settings.twitter_api, unidify_dir)
    assert unidified is not None
    assert len(batch) == len(unidified)
    assert tweets == collect_tweets(unidified)
Ejemplo n.º 4
0
def test_execute_success_parallel(tmp_path: Path,
                                  monkeypatch: MonkeyPatch) -> None:
    """Executing 16 single-day searches with 4 worker processes succeeds."""
    monkeypatch.setenv("NASTY_NUM_WORKERS", "4")
    batch = Batch()
    # One search per day of 2019-01-01 .. 2019-01-16.
    for day in range(1, 17):
        daily_search = Search(
            "trump",
            since=date(2019, 1, day),
            until=date(2019, 1, day + 1),
            max_tweets=50,
        )
        batch.append(daily_search)
    assert batch.execute(tmp_path)
    _assert_results_dir_structure(tmp_path, list(batch))
Ejemplo n.º 5
0
def test_dump_load_single(request_: Request, tmp_path: Path) -> None:
    """Dumping a single request and loading it back round-trips the batch."""
    batch_file = tmp_path / "batch.jsonl"

    original = Batch()
    original.append(request_)
    original.dump(batch_file)

    dumped_lines = list(read_lines_file(batch_file))
    assert len(dumped_lines) == 1
    assert len(dumped_lines[0]) != 0

    reloaded = Batch()
    reloaded.load(batch_file)
    assert list(original) == list(reloaded)
Ejemplo n.º 6
0
def test_dump_load_multiple(num_batch_entries: int, tmp_path: Path) -> None:
    """Dump/load round-trips a batch containing several distinct entries."""
    batch_file = tmp_path / "batch.jsonl"

    original = Batch()
    for i in range(1, num_batch_entries + 1):
        original.append(Search(str(i), max_tweets=i, batch_size=i))
    original.dump(batch_file)

    # One non-empty JSON line per entry.
    dumped_lines = list(read_lines_file(batch_file))
    assert len(dumped_lines) == num_batch_entries
    assert all(len(line) != 0 for line in dumped_lines)

    reloaded = Batch()
    reloaded.load(batch_file)
    assert list(original) == list(reloaded)
Ejemplo n.º 7
0
def _make_batch_results(*,
                        idify_dir: Optional[Path] = None,
                        unidify_dir: Optional[Path] = None) -> BatchResults:
    """Execute a one-Thread batch, optionally idifying/unidifying the results.

    If `idify_dir`/`unidify_dir` are given, the corresponding transformation
    is applied to the results before returning them.
    """
    batch = Batch()
    batch.append(Thread("1115689254271819777"))

    current = batch.execute()
    assert current is not None

    if idify_dir is not None:
        current = current.idify(idify_dir)
        assert current is not None

    if unidify_dir is not None:
        current = current.unidify(unidify_dir)
        assert current is not None

    return current
Ejemplo n.º 8
0
def test_correct_call_to_batch(
    request_: Request,
    capsys: CaptureFixture,
    tmp_path: Path,
) -> None:
    """Submitting to a batch file writes one pending entry and prints nothing."""
    batch_file = tmp_path / "batch.jsonl"

    main(*_make_args(request_, to_batch=batch_file))
    assert capsys.readouterr().out == ""

    loaded = Batch()
    loaded.load(batch_file)
    assert len(loaded) == 1
    entry = loaded[0]
    assert entry.request == request_
    assert entry.id
    assert entry.completed_at is None
    assert entry.exception is None
Ejemplo n.º 9
0
def test_correct_call_to_batch_daily(capsys: CaptureFixture,
                                     tmp_path: Path) -> None:
    """`daily=True` splits a month-long search into one batch entry per day."""
    batch_file = tmp_path / "batch.jsonl"
    request = Search("trump", since=date(2019, 1, 1), until=date(2019, 2, 1))

    # Needed for type checking.
    assert request.until is not None and request.since is not None

    main(*_make_args(request, to_batch=batch_file, daily=True))
    assert capsys.readouterr().out == ""

    loaded = Batch()
    loaded.load(batch_file)
    assert len(loaded) == (request.until - request.since).days
    for entry, expected_request in zip(loaded, request.to_daily_requests()):
        assert entry.request == expected_request
        assert entry.id
        assert entry.completed_at is None
        assert entry.exception is None
Ejemplo n.º 10
0
def _make_batch_results(
    settings: NastySettings,
    *,
    idify_dir: Optional[Path] = None,
    unidify_dir: Optional[Path] = None,
) -> BatchResults:
    """Execute a one-Thread batch, optionally idifying/unidifying the results.

    `settings` supplies the Twitter API credentials used for unidifying.
    """
    batch = Batch()
    batch.append(Thread("1115689254271819777"))

    current = batch.execute()
    assert current is not None

    if idify_dir is not None:
        current = current.idify(idify_dir)
        assert current is not None

    if unidify_dir is not None:
        current = current.unidify(settings.twitter_api, unidify_dir)
        assert current is not None

    return current
Ejemplo n.º 11
0
def test_execute_stray_data_file(tmp_path: Path,
                                 caplog: LogCaptureFixture) -> None:
    """A stray data file without a meta file fails the entry with a ValueError,
    leaves the stray file untouched, and succeeds once the file is removed."""
    batch = Batch()
    batch.append(Search("trump", max_tweets=50))

    # Plant a data file for the entry before it is ever executed.
    batch_entry = batch[0]
    data = "Just some stray data."
    data_file = tmp_path / batch_entry.data_file_name
    write_file(data_file, data)
    data_stat1 = data_file.stat()

    assert not batch.execute(tmp_path)

    # Assert exception is saved.
    assert batch_entry == read_json(tmp_path / batch_entry.meta_file_name,
                                    BatchEntry)
    assert batch_entry.exception is not None
    assert batch_entry.exception.type == "ValueError"
    batch_entry.exception = None

    # Assert that previous data file is not modified. Compare nanosecond
    # timestamps (st_mtime_ns) rather than the lossy float st_mtime, consistent
    # with the other mtime checks in this file.
    data_stat2 = data_file.stat()
    assert data_stat1.st_mtime_ns == data_stat2.st_mtime_ns
    assert data == read_file(data_file)

    # Delete data file and verify that it works now.
    data_file.unlink()
    caplog.clear()
    assert batch.execute(tmp_path)
    assert 1 == len(
        [record for record in caplog.records if "Retrying" in record.msg])
    _assert_results_dir_structure(tmp_path, list(batch))
Ejemplo n.º 12
0
def test_execute_exception_internal_server_error(tmp_path: Path) -> None:
    """A 500 from Twitter is recorded as an UnexpectedStatusCodeException."""
    # Simulate 500 Internal Server Error on first request to Twitter.
    responses.add(
        responses.GET,
        "https://mobile.twitter.com/robots.txt",
        body="Crawl-delay: 1",
    )
    responses.add(
        responses.GET,
        "https://mobile.twitter.com/search",
        match_querystring=False,
        status=HTTPStatus.INTERNAL_SERVER_ERROR.value,
    )

    batch = Batch()
    batch.append(Search("trump", max_tweets=50))
    assert not batch.execute(tmp_path)

    entry = batch[0]
    persisted = read_json(tmp_path / entry.meta_file_name, BatchEntry)
    assert entry == persisted
    assert entry.exception is not None
    assert entry.exception.type == "UnexpectedStatusCodeException"
Ejemplo n.º 13
0
def test_correct_call_to_batch_exists(
    old_request: Request,
    new_request: Request,
    capsys: CaptureFixture,
    tmp_path: Path,
) -> None:
    """Submitting to an existing batch file appends instead of overwriting."""
    batch_file = tmp_path / "batch.jsonl"
    existing = Batch()
    existing.append(old_request)
    existing.dump(batch_file)

    main(*_make_args(new_request, to_batch=batch_file))
    assert capsys.readouterr().out == ""

    loaded = Batch()
    loaded.load(batch_file)
    assert len(loaded) == 2
    for entry, expected_request in zip(loaded, [old_request, new_request]):
        assert entry.request == expected_request
        assert entry.id
        assert entry.completed_at is None
        assert entry.exception is None
Ejemplo n.º 14
0
def test_execute_retrying_after_exception(tmp_path: Path,
                                          caplog: LogCaptureFixture) -> None:
    """An entry whose meta file records an exception is retried and rewritten."""
    batch = Batch()
    batch.append(Search("trump", max_tweets=50))

    # Pre-write a meta file carrying a serialized exception for the entry.
    batch_entry = batch[0]
    exception = _make_json_serialized_exception()
    batch_entry.exception = exception
    meta_file = tmp_path / batch_entry.meta_file_name
    write_json(meta_file, batch_entry)
    batch_entry.exception = None
    meta_stat1 = meta_file.stat()

    caplog.clear()
    assert batch.execute(tmp_path)
    # The log must say we are retrying and include the previous exception.
    retry_records = [
        record for record in caplog.records
        if "Retrying" in record.msg and str(exception) in record.msg
    ]
    assert len(retry_records) == 1

    _assert_results_dir_structure(tmp_path, list(batch))
    meta_stat2 = meta_file.stat()
    assert meta_stat1.st_mtime_ns < meta_stat2.st_mtime_ns
Ejemplo n.º 15
0
 def run(self) -> None:
     """Build the request, then either queue it in a batch file or stream
     the resulting Tweets as JSON lines to stdout."""
     request = self._build_request()
     if not self.to_batch:
         # No batch file configured: execute immediately, one JSON per line.
         for tweet in request.request():
             sys.stdout.write(json.dumps(tweet.to_json()) + "\n")
         return
     batch = Batch()
     if self.to_batch.exists():
         batch.load(self.to_batch)
     self._batch_submit(batch, request)
     batch.dump(self.to_batch)
Ejemplo n.º 16
0
 def run(self) -> None:
     """Load the batch from its file and execute it into the results dir."""
     loaded_batch = Batch()
     loaded_batch.load(self.batch_file)
     loaded_batch.execute(self.results_dir)
Ejemplo n.º 17
0
 def _batch_submit(self, batch: Batch, request: Request) -> None:
     """Append the given request as a new entry of the given batch."""
     batch.append(request)
Ejemplo n.º 18
0
def test_execute_skipping(tmp_path: Path, caplog: LogCaptureFixture) -> None:
    batch_file = tmp_path / "batch.jsonl"
    results_dir = tmp_path / "out"

    # Execute request for the first time.
    batch = Batch()
    batch.append(Search("trump", max_tweets=50))
    batch.dump(batch_file)
    assert batch.execute(results_dir)
    _assert_results_dir_structure(results_dir, list(batch))

    batch_entry = batch[0]
    meta_file = results_dir / batch_entry.meta_file_name
    data_file = results_dir / batch_entry.data_file_name
    meta_stat1 = meta_file.stat()
    data_stat1 = data_file.stat()

    # Execute same request again (should be skipped).
    batch = Batch(
    )  # Recreate from dumped batch file so that batch entry IDs match.
    batch.load(batch_file)
    caplog.clear()
    assert batch.execute(results_dir)
    assert 1 == len(
        [record for record in caplog.records if "Skipping" in record.msg])
    _assert_results_dir_structure(results_dir, list(batch))
    meta_stat2 = meta_file.stat()
    data_stat2 = data_file.stat()

    # Verify that files were not modified.
    assert meta_stat1.st_mtime_ns == meta_stat2.st_mtime_ns
    assert data_stat1.st_mtime_ns == data_stat2.st_mtime_ns