def games(api_version, from_date, to_date):
    """Crawl games for a date range and store a one-row job-metadata CSV.

    The crawl itself is best-effort: any exception is echoed and recorded in
    the metadata instead of propagating, and the metadata record is written
    in every case.

    Args:
        api_version: Version selector handed to
            ``API_FACTORY.adapter_for_version``.
        from_date: Start of the query window; must support ``strftime`` and
            is passed to the crawler unchanged.
        to_date: End of the query window; same requirements as ``from_date``.

    Environment:
        DEST_BUCKET: Bucket for crawled data (default ``'output'``).
        JOB_BUCKET: Bucket for job metadata (default ``'jobs'``).
        S3_ENDPOINT_URL: Optional endpoint URL for the S3 client.
    """
    # Read the clock once so execution_date and execution_ts always describe
    # the same instant; two separate reads can straddle a date boundary.
    # NOTE: utcnow() is naive, so the trailing %z renders as an empty string.
    started_at = datetime.utcnow()
    meta = JobMetadata(
        id=str(uuid.uuid4()),
        app_version=__version__,
        execution_date=started_at.strftime("%Y/%m/%d"),
        execution_ts=started_at.strftime("%Y-%m-%dT%H:%M:%S.%f%z"),
        query_start_date=from_date.strftime(DATE_FORMATS[0]),
        query_stop_date=to_date.strftime(DATE_FORMATS[0]),
        job_successful='True',
        job_exception=''
    )

    bucket = os.environ.get('DEST_BUCKET', 'output')
    jobs = os.environ.get('JOB_BUCKET', 'jobs')
    s3client = boto3.client('s3',
                            config=Config(signature_version='s3v4'),
                            endpoint_url=os.environ.get('S3_ENDPOINT_URL'))
    storage = Storage(bucket, jobs, s3client)

    try:
        api_adapters = API_FACTORY.adapter_for_version(api_version)
        api = api_adapters.api()
        crawler = api_adapters.crawler(api, storage)
        crawler.crawl(from_date, to_date)
    except Exception as e:
        click.echo('JOB RUN FAILED')
        click.echo(e)
        # If it blows up, update the meta object. Commas and line breaks are
        # flattened to spaces so the repr cannot split the single-row CSV
        # record into extra columns or rows.
        meta.job_successful = 'False'
        meta.job_exception = (
            repr(e).replace(',', ' ').replace('\r', ' ').replace('\n', ' ')
        )
    finally:
        # Convert once so the header and the value row come from the same
        # snapshot of the metadata.
        record = asdict(meta)
        meta_keys = ','.join(record.keys())
        meta_values = ','.join(record.values())
        storage_key = f'{meta.execution_date}/{meta.id}.csv'
        csv_string = f'{meta_keys}\n{meta_values}'
        storage.store_job(storage_key, csv_string)
def test_storage_store_job():
    """store_job returns True and puts the body into the job bucket."""
    job_key = '1/2/3/4.csv'
    payload = 'foo bar baz'
    client = Mock()

    outcome = Storage('testbucket', 'jobbucket', client).store_job(
        job_key, payload)

    assert outcome is True
    # The write must target the job bucket, not the data bucket.
    client.put_object.assert_called_with(
        Bucket='jobbucket',
        Key=job_key,
        Body=payload,
    )
def test_storage_store_game():
    """store_game returns True and puts the body into the data bucket."""
    payload = 'foo bar baz'
    client = Mock()
    game_key = StorageKey('a', 'b', 'c', 'd')

    outcome = Storage('testbucket', 'jobbucket', client).store_game(
        game_key, payload)

    assert outcome is True
    # The four key parts are expected to appear as an 'a/b/c/d.csv' object key.
    client.put_object.assert_called_with(
        Bucket='testbucket',
        Key='a/b/c/d.csv',
        Body=payload,
    )
def test_crawl(schedule_data, game_2019030314_data):
    """Crawling a two-game schedule uploads one CSV per game to the data bucket."""
    databucket = 'testdatabucket'
    jobbucket = 'testjobbucket'
    game_1_id = '2019030314'
    game_2_id = '2019030325'
    header_string = ','.join(header)

    schedule = 'https://statsapi.web.nhl.com/api/v1/schedule?startDate=2020-01-01&endDate=2020-01-02'
    boxscore_1 = f'https://statsapi.web.nhl.com/api/v1/game/{game_1_id}/boxscore'
    boxscore_2 = f'https://statsapi.web.nhl.com/api/v1/game/{game_2_id}/boxscore'

    with requests_mock.Mocker() as m:
        m.get(schedule, json=schedule_data, status_code=200)
        # Both boxscore endpoints serve the same fixture payload.
        for boxscore_url in (boxscore_1, boxscore_2):
            m.get(boxscore_url, json=game_2019030314_data, status_code=200)

        s3_mock = Mock()
        storage = Storage(databucket, jobbucket, s3_mock)
        api = NHLApi()
        crawler = Crawler(api, storage)
        crawler.crawl(datetime(2020, 1, 1), datetime(2020, 1, 2))

    assert s3_mock.put_object.call_count == 2

    expected_keys = (
        f'2020/09/13/{game_1_id}.csv',
        f'2020/09/14/{game_2_id}.csv',
    )
    uploads = [call.kwargs for call in s3_mock.put_object.call_args_list[:2]]
    for upload, expected_key in zip(uploads, expected_keys):
        assert upload.get('Bucket') == databucket
        assert upload.get('Key') == expected_key
        assert header_string in upload.get('Body')
def test_flatten_json(self, transformer_class):
    """flatten_json collapses nested lists/dicts into 'player_'-prefixed keys."""
    job_uuid = 1
    transformer = transformer_class(Storage(None, None), job_uuid)

    nested = {
        "a": [
            {"b": [{"c": 1}, {"d": 2}]},
            {"bb": [{"cc": 11}, {"dd": 22}]},
        ],
    }

    # Each path segment (dict key or list index) is joined with underscores.
    assert transformer.flatten_json(nested) == {
        'player_a_0_b_0_c': 1,
        'player_a_0_b_1_d': 2,
        'player_a_1_bb_0_cc': 11,
        'player_a_1_bb_1_dd': 22,
    }
def test_init(self, transformer_class):
    """The transformer exposes the uuid it was constructed with."""
    job_uuid = 1
    transformer = transformer_class(Storage(None, None), job_uuid)

    assert transformer.uuid == job_uuid
def test_init(self, crawler_class):
    """The crawler exposes the uuid it was constructed with."""
    job_uuid = 1
    api = NHLApi()
    store = Storage(None, None)

    crawler = crawler_class(api, store, job_uuid)

    assert crawler.uuid == job_uuid
def test_crawl_no_games():
    """A schedule reporting zero games results in no uploads at all."""
    databucket = 'testdatabucket'
    jobbucket = 'testjobbucket'
    schedule = 'https://statsapi.web.nhl.com/api/v1/schedule?startDate=2020-01-01&endDate=2020-01-02'
    window_start = datetime(2020, 1, 1)
    window_end = datetime(2020, 1, 2)

    with requests_mock.Mocker() as m:
        m.get(schedule, json={'totalGames': 0}, status_code=200)

        s3_mock = Mock()
        storage = Storage(databucket, jobbucket, s3_mock)
        Crawler(NHLApi(), storage).crawl(window_start, window_end)

    assert s3_mock.put_object.call_count == 0