class TestEncoding(object):
    """Check that analytics rows for an entity with a non-ascii name round-trip cleanly."""
    api = RPApi()
    # a one-minute window in which one event is known to exist
    params = {
        'start_date': '2018-05-01 21:51',
        'end_date': '2018-05-01 21:52',
    }

    @classmethod
    def setup_class(cls):
        # entity 9BFEB5 has a non-ascii name, which exercises the encoding path
        cls.ds = cls.api.create_dataset(
            Dataset(
                name='testing_encoding',
                filters={"rp_entity_id": '9BFEB5'},
            )
        )

    def test_json_iterate(self):
        """The json endpoint returns iterable, printable rows."""
        self.api.log_curl_commands = True
        results = self.ds.json(**self.params)
        assert results, 'We should have some result in the timerange'
        for analytic_row in results:
            print(analytic_row)

    def test_dump_iterate(self):
        """The datafile endpoint rows are also iterable and printable."""
        for analytic_row in self.ds.request_datafile(**self.params):
            print(analytic_row)

    @classmethod
    def teardown_class(cls):
        cls.ds.delete()
class TestRealtimeFeed():
    """Exercise the realtime streaming endpoint."""
    api = RPApi()

    @pytest.mark.slow
    def test_get_something_from_us500(self, dataset='all-granular-data', max_received=5):
        """Read at most max_received realtime records, validating each one."""
        ds = self.api.get_dataset(dataset_id=dataset)
        received = 0
        for record in ds.request_realtime():
            assert isinstance(record, Result)
            received += 1
            logger.info("Got {received}/{max} from us500".format(
                received=received, max=max_received
            ))
            errors = record.is_invalid
            assert errors is False, 'Record is invalid: %s' % errors
            # stop once max_received records have been validated
            # (fixed off-by-one: '>' used to process one record too many)
            if received >= max_received:
                break

    def test_missing_dataset(self):
        """Streaming an unknown dataset must fail with a 403 APIException."""
        ds = self.api.get_dataset(dataset_id='missing-dataset')
        with pytest.raises(APIException) as e:
            for record in ds.request_realtime():
                pass
        assert e.value.response.status_code == 403
class TestEntityTypeReference(object):
    """Tests for the entity-type reference endpoint, using sport teams."""
    api = RPApi()
    team_reference = api.get_entity_type_reference('team')

    @pytest.mark.slow
    def test_save_team_reference(self):
        """The team reference can be written to disk as a CSV file."""
        reference = self.api.get_entity_type_reference('team')
        tmp = tempfile.NamedTemporaryFile(prefix='test_reference', delete=False)
        filepath = tmp.name
        reference.write_to_file(filepath)
        with io.open(filepath, encoding='latin-1') as saved:
            lines = saved.readlines()
            assert len(lines) > 100, "We should have several rows"
            # the first line must be the exact CSV header
            assert lines[0] == ('RP_ENTITY_ID,ENTITY_TYPE,DATA_TYPE,'
                                'DATA_VALUE,RANGE_START,RANGE_END\n')
        os.unlink(filepath)

    def test_team_reference_as_map(self):
        """The reference behaves like a mapping keyed by rp_entity_id."""
        club = self.team_reference['022568']
        assert club.name == 'Olympique de Marseille'

    def test_team_iterate_entities(self):
        """Iterating the reference yields entities with a name history."""
        valid_entities = set()
        for entity in self.team_reference:
            if entity.entity_names[-1].is_valid():
                valid_entities.add(entity.rp_entity_id)
        assert len(valid_entities) > 100, "We should have several valid teams"
class TestDatasetUpdate(object):
    """Create a dataset, read it, update it twice, then delete it."""
    api = RPApi()
    dataset_name = 'testing_ds_update'

    def test_create_and_update(self):
        # start from a clean slate (there can be several datasets with this name)
        delete_all_datasets_by_name(self.api, self.dataset_name)

        # create a dataset with a single-entity filter
        dataset = self.api.create_dataset(Dataset(
            name=self.dataset_name,
            filters={"rp_entity_id": {"$in": ['AAAAA']}},
        ))
        assert dataset.id is not None
        dataset_id = dataset.id

        # update the filters and persist the change
        updated_filters = {"rp_entity_id": {"$in": ['BBBBB']}}
        dataset.filters = updated_filters
        dataset.save()

        # re-fetch the dataset and verify the update stuck
        dataset = self.api.get_dataset(dataset_id)
        assert dataset.filters == updated_filters

        # update once more on the re-fetched instance
        dataset.filters = {"rp_entity_id": {"$in": ['CCCCC']}}
        dataset.save()

        dataset.delete()
        assert delete_all_datasets_by_name(self.api, self.dataset_name) == 0
class TestEntityMapping(object):
    """Tests for the entity-mapping endpoint."""
    api = RPApi()

    def test_matching_entity_mapping(self):
        """Three well-known identifiers must all map to RavenPack entities."""
        requests = [
            {'ticker': 'AAPL', 'name': 'Apple Inc.'},
            {'ticker': 'JPM'},
            {'listing': 'XNYS:DVN'},
        ]
        mapping = self.api.get_entity_mapping(requests)
        assert not mapping.errors
        assert len(mapping.matched) == len(mapping.submitted) == 3
        # matches come back in submission order
        assert [match.id for match in mapping.matched] == ['D8442A', '619882', '14BA06']

    def test_mismatch_mapping(self):
        """An unknown request string should yield no matches."""
        mapping = self.api.get_entity_mapping(["unknown!"])
        assert [match.id for match in mapping.matched] == []
class TestDeleteAllByName(object):
    """delete_all_datasets_by_name must remove every same-named dataset."""
    api = RPApi()
    base_dataset = Dataset(
        name='testing_api_delete_all',
        filters={},  # a dataset without filters
    )

    def test_delete_all_by_name(self):
        dataset_name = self.base_dataset.name

        # start from a clean slate
        delete_all_datasets_by_name(self.api, dataset_name)
        assert len(get_datasets_by_name(self.api, dataset_name)) == 0, \
            "Seems we have datasets that should be deleted"

        # create two datasets sharing the same name
        ds1 = self.api.create_dataset(self.base_dataset)
        ds2 = self.api.create_dataset(self.base_dataset)
        assert len(get_datasets_by_name(self.api, dataset_name)) == 2, \
            "We should have just created 2 datasets"

        # both must show up among the datasets we own
        owned_dataset = self.api.list_datasets()
        assert ds1 in owned_dataset
        assert ds2 in owned_dataset

        # deleting by name removes them all
        delete_all_datasets_by_name(self.api, dataset_name)
        assert len(get_datasets_by_name(self.api, dataset_name)) == 0, \
            "Seems we have datasets that should be deleted"
class TestUploadFlow:
    """Smoke test for the upload quota endpoint."""
    api = RPApi()

    def test_quota(self):
        """The quota response must expose both 'files' and 'quota'."""
        data = self.api.upload.quota()
        assert 'files' in data
        assert 'quota' in data
class TestDatafile(object):
    """Asynchronous datafile generation on the public 'swiss20' dataset."""
    api = RPApi()

    @pytest.mark.slow
    @pytest.mark.datafile
    def test_small_async_download(self):
        """A one-day datafile request returns a Job that can be saved to disk."""
        ds = self.api.get_dataset(dataset_id='swiss20')
        job = ds.request_datafile(
            start_date='2018-01-01 18:00:00',
            end_date='2018-01-02 18:00:00',
        )
        assert isinstance(job, Job)
        with tempfile.NamedTemporaryFile() as fp:
            job.save_to_file(filename=fp.name)

    @pytest.mark.slow
    @pytest.mark.datafile
    def test_small_async_with_headers(self):
        """iterate_results(include_headers=True) yields the header row first."""
        ds = self.api.get_dataset(dataset_id='swiss20')
        job = ds.request_datafile(
            start_date='2018-01-01 18:00:00',
            end_date='2018-01-01 18:05:00',
            fields=['rp_story_id', 'timestamp_utc'],
        )
        records = list(job.iterate_results(include_headers=True))
        assert len(records) > 1
        # the very first record must be the header row
        assert records[0] == ['RP_STORY_ID', 'TIMESTAMP_UTC']
class TestDatasetList(object):
    """Listing public datasets should lazily expose their full details."""
    api = RPApi()

    def test_list_public_filters(self):
        # the listing itself only carries name and id for each dataset
        datasets = self.api.list_datasets(scope="public")
        # accessing further attributes transparently fetches the dataset
        for ds in datasets[:3]:
            assert ds.id and ds.uuid and ds.name and ds.filters
class TestRecentAnalyticsRetried:
    """Deleting an upload right after creation may briefly 404 server side."""
    api = RPApi()

    def test_upload_delete_retry(self):
        """The client should silently retry the delete until it succeeds."""
        sample_path = os.path.join(os.path.dirname(__file__), "upload_sample.txt")
        uploaded = self.api.upload.file(sample_path)
        uploaded.delete()
class TestDatasetUpdate(object):
    """Try to Create a dataset, Read it, Update it and Delete it."""
    api = RPApi()
    dataset_name = 'testing_ds_update'

    def test_create_and_update(self):
        """Full CRUD cycle: create, update, re-read, update again, delete."""
        delete_all_datasets_by_name(self.api, self.dataset_name)
        filters = {"rp_entity_id": {"$in": ['AAAAAA']}}
        dataset = Dataset(
            name=self.dataset_name,
            filters=filters,  # a dataset with a filter
        )
        dataset = self.api.create_dataset(dataset)
        assert dataset.id is not None
        dataset_id = dataset.id
        # change the dataset filters and persist
        new_filters = {"rp_entity_id": {"$in": ['BBBBBB']}}
        dataset.filters = new_filters
        dataset.save()
        # get the dataset again and verify the update stuck
        dataset = self.api.get_dataset(dataset_id)
        assert dataset.filters == new_filters
        # update once more on the re-fetched instance
        new_filters = {"rp_entity_id": {"$in": ['CCCCCC']}}
        dataset.filters = new_filters
        dataset.save()
        dataset.delete()
        assert delete_all_datasets_by_name(self.api, self.dataset_name) == 0

    def test_simple_update(self):
        """Updating filters on a lazily-retrieved dataset changes its results."""
        filters = {"rp_entity_id": {"$in": ['D8442A']}}
        ds = self.api.create_dataset(
            Dataset(
                name=self.dataset_name,
                filters=filters,  # a dataset with a filter
            )
        )
        # a freshly created dataset is fully populated - no lazy fetch pending
        assert ds._lazy_retrieve_on_get is False
        dataset_id = ds.id
        ds = self.api.get_dataset(dataset_id)  # retrieve the dataset
        # a dataset fetched by id is lazy: details load on first attribute access
        assert ds._lazy_retrieve_on_get is True
        ds.filters = {"rp_entity_id": {"$in": ["228D42"]}}  # update the dataset
        ds.save()
        # the first returned row must reflect the new filter entity
        for r in ds.json('2019-01-01', '2019-01-02'):
            assert r['rp_entity_id'] == '228D42', "Expecting entity to be 228D42 - got %s" % r['rp_entity_id']
            break
class TestDatafile(object):
    """Resolve document URLs for premium and non-premium stories."""
    api = RPApi()

    def test_premium_url(self):
        """Premium stories resolve to a ravenpack.com hosted URL."""
        premium_story_id = 'B5461869942657A8D4956BE409DEC944'
        url = self.api.get_document_url(premium_story_id)
        assert "ravenpack.com" in url

    def test_nonpremium_url(self):
        """Non-premium stories resolve to the original publisher URL."""
        # renamed from 'premium_story_id': this story is NOT premium
        nonpremium_story_id = '691D5D416F8E9752DDD9C2F8C30FBE53'
        url = self.api.get_document_url(nonpremium_story_id)
        assert 'https://www.india.com/' in url
class TestDatasetRetrieval(object):
    """A dataset can be retrieved by id, by uuid, or via the api helper."""
    api = RPApi()

    def test_get_dataset(self):
        dataset_id = 'us30'
        ds_by_id = Dataset(api=self.api, id=dataset_id)
        # accessing .filters triggers the lazy fetch of the definition
        assert isinstance(ds_by_id.filters, dict)
        # all three retrieval styles must agree on the dataset definition
        ds_via_api = self.api.get_dataset(dataset_id)
        ds_by_uuid = Dataset(api=self.api, uuid=dataset_id)
        assert ds_via_api.filters == ds_by_id.filters == ds_by_uuid.filters
class TestDatafile(object):
    """Async datafile download for the public 'swiss20' dataset."""
    api = RPApi()

    @pytest.mark.slow
    @pytest.mark.datafile
    def test_small_async_download(self):
        """A datafile request returns a Job that can be saved to disk."""
        dataset = self.api.get_dataset(dataset_id='swiss20')
        job = dataset.request_datafile(
            start_date='2018-01-01 18:00:00',
            end_date='2018-01-02 18:00:00',
        )
        assert isinstance(job, Job)
        with tempfile.NamedTemporaryFile() as tmp:
            job.save_to_file(filename=tmp.name)
class TestEntityMapping(object):
    """Entity-mapping endpoint: matches, mismatches and mixed requests."""
    api = RPApi()

    def test_matching_entity_mapping(self):
        """Three well-known identifiers must all map to RavenPack entities."""
        queries = [
            {'ticker': 'AAPL', 'name': 'Apple Inc.'},
            {'ticker': 'JPM'},
            {'listing': 'XNYS:DVN'},
        ]
        mapping = self.api.get_entity_mapping(queries)
        assert not mapping.errors
        assert len(mapping.matched) == len(mapping.submitted) == 3
        # matches come back in submission order
        assert [m.id for m in mapping.matched] == ['D8442A', '619882', '14BA06']

    def test_mismatch_mapping(self):
        """An unknown request string yields no matches."""
        mapping = self.api.get_entity_mapping(["unknown!"])
        assert [m.id for m in mapping.matched] == []

    def test_mapping_example(self):
        """Mix plain strings and rich dicts; invalid requests land in errors."""
        invalid_entity_request = "Unknown entity specified"
        universe = [
            "RavenPack",
            {'ticker': 'AAPL'},
            'California USA',
            {
                # Amazon, specifying various fields
                "client_id": "12345-A",
                "date": "2017-01-01",
                "name": "Amazon Inc.",
                "entity_type": "COMP",
                "isin": "US0231351067",
                "cusip": "023135106",
                "sedol": "B58WM62",
                "listing": "XNAS:AMZN"
            },
            invalid_entity_request,
        ]
        mapping = self.api.get_entity_mapping(universe)
        assert len(mapping.matched) == 4
        assert [m.name for m in mapping.matched] == [
            "RavenPack International S.L.",
            "Apple Inc.",
            "California, U.S.",
            "Amazon.com Inc.",
        ]
        # the unmappable request must be reported, with the original payload
        assert len(mapping.errors) == 1
        assert mapping.errors[0].request == invalid_entity_request
class TestEntityReference(object):
    """Entity reference lookups by rp_entity_id."""
    api = RPApi()

    def test_apple(self):
        """Apple's reference exposes its current name and ticker."""
        reference = self.api.get_entity_reference(APPLE_RP_ENTITY_ID)
        assert reference.rp_entity_id == APPLE_RP_ENTITY_ID
        # the latest name in the history is also the current name
        assert reference.names[-1].value == reference.name == 'APPLE INC.'
        assert reference.tickers[-1].value == 'AAPL'

    def test_failing(self):
        """An invalid entity id must raise an APIException."""
        try:
            # result intentionally discarded - the call itself must raise
            # (fixed: the original bound it to an unused 'missing' variable)
            self.api.get_entity_reference('invalid')
            assert False, "Invalid entity should raise an Exception"
        except APIException:
            pass
class TestAdHocJson(object):
    """Ad-hoc json queries (no saved dataset required)."""
    api = RPApi()

    def test_small_adhoc(self):
        """Five minutes of PROD-entity analytics should not be empty."""
        data = self.api.json(
            start_date='2018-01-01 18:00:00',
            end_date='2018-01-01 18:05:00',
            fields=['timestamp_utc', 'rp_entity_id', 'headline'],
            filters={"entity_type": {"$in": ['PROD']}},
        )
        assert isinstance(data, Results)
        assert len(data) > 0, 'We should have some product in those 5 minutes'
class TestDatasetJson(object):
    """Query dataset analytics through the json endpoint."""
    api = RPApi()

    def test_known_swiss(self):
        """A full day of the public 'swiss20' dataset returns plenty of rows."""
        ds = self.api.get_dataset(dataset_id='swiss20')
        data = ds.json(
            start_date='2018-01-01 18:00:00',
            end_date='2018-01-02 18:00:00',
        )
        assert isinstance(data, Results)
        assert len(data) > 500, 'We should have more data in 1 day of swiss20'

    def test_indicator_dataset(self):
        """A daily indicator dataset aggregates rows; overriding fields and
        frequency in the query exposes the underlying granular data."""
        indicator_dataset = Dataset(
            name='Test-indicator-dataset',
            filters={"$and": [{"rp_entity_id": {"$in": ["D8442A"]}}]},
            fields=[{"average": {"avg": {"field": "EVENT_SENTIMENT_SCORE"}}}],
            frequency='daily',
        )
        indicator_dataset = self.api.create_dataset(indicator_dataset)
        try:
            # ask the indicator dataset for its data
            response = indicator_dataset.json('2018-01-01 00:00', '2018-01-02 00:00')
            assert len(response) == 2  # we should get 2 rows
            # one row for the entity plus the ROLLUP aggregate row
            assert {r['rp_entity_id'] for r in response} == {'D8442A', 'ROLLUP'}
            # do a request overriding fields and frequency to see the underlying data
            response = indicator_dataset.json('2018-01-01 00:00', '2018-01-02 00:00',
                                              fields=['rp_story_id', 'rp_entity_id'],
                                              frequency='granular')
            assert len(response) > 200, "We should have many granular analytics rows"
            assert {r['rp_entity_id'] for r in response} == {'D8442A'}, "All rows should be D8442A"
        finally:
            # always remove the temporary dataset, even on assertion failure
            indicator_dataset.delete()

    def test_granular_dataset(self):
        """A granular dataset can be created, queried and deleted."""
        self.api.log_curl_commands = True
        granular_dataset = Dataset(
            name='Test-granular-dataset',
            filters={"$and": [{"rp_entity_id": {"$in": ["D8442A"]}}, {"relevance": 90}]},
        )
        granular_dataset = self.api.create_dataset(granular_dataset)
        try:
            granular_dataset.json('2018-01-01 00:00', '2018-01-02 00:00')
        finally:
            # always clean up the temporary dataset
            granular_dataset.delete()
class TestDatasetCount(object):
    """The count endpoint honours the time_zone parameter."""
    api = RPApi()

    @pytest.mark.json
    def test_count_timezone(self):
        ds = self.api.get_dataset(dataset_id='us30')
        # same date range, default (UTC) interpretation
        count_results_utc = ds.count(
            start_date="2019-05-14",
            end_date="2019-05-15",
        )
        assert isinstance(count_results_utc, dict)
        # the same range interpreted in Europe/London shifts the window,
        # so the counts must differ
        count_results_london = ds.count(
            start_date="2019-05-14",
            end_date="2019-05-15",
            time_zone="Europe/London",
        )
        assert isinstance(count_results_london, dict)
        assert count_results_london != count_results_utc
class TestJobCancellation(object):
    """Cancelling a datafile job: allowed while enqueued, rejected (400) once processing."""
    api = RPApi()
    ds = None

    @classmethod
    def setup_class(cls):
        # small single-entity dataset so the datafile job is quick to enqueue
        cls.ds = cls.api.create_dataset(
            Dataset(
                name='test_job_cancel',
                filters={
                    "rp_entity_id": 'D8442A'
                },
            )
        )

    def test_job_cancel(self):
        params = dict(
            start_date='2018-05-10 21:51',  # we have an event here
            end_date='2018-05-10 21:52',
        )
        job = self.ds.request_datafile(
            **params
        )
        # snapshot the status BEFORE cancelling: the job may start processing
        # at any moment, and that changes which outcome we expect below
        status = job.get_status()
        try:
            job.cancel()
        except APIException as exception:
            # cancel raised an exception, means that we were already processing it
            assert status == 'processing'
            assert exception.response.status_code == 400
        else:
            # cancel succeeded: the job must have still been enqueued
            assert status == 'enqueued'
            assert job.get_status() == 'cancelled'
            assert job.is_processing is False
            # waiting on a cancelled job is an error
            with pytest.raises(JobNotProcessing):
                job.wait_for_completion()

    @classmethod
    def teardown_class(cls):
        cls.ds.delete()
class TestDatasetCRUD(object):
    """Try to Create a dataset, Read it, Update it and Delete it."""
    api = RPApi()
    dataset_name = 'testing_api_crud'

    def test_get_public_dataset_list(self):
        public = self.api.list_datasets(scope='public')
        assert 'us30' in public, 'US30 should be in public datasets'
        assert len(public) > 100, 'We expect at least 100 public RavenPack datasets'

    def test_get_private_dataset_list(self):
        assert len(self.api.list_datasets()) > 0, "Don't you have a dataset?"

    def test_create_and_delete(self):
        # the test dataset may already be there - possibly several sharing
        # the same name - so clean them all up first
        delete_all_datasets_by_name(self.api, self.dataset_name)

        # create a dataset with a single-entity filter
        template = Dataset(
            name=self.dataset_name,
            filters={"rp_entity_id": {"$in": ['D8442A']}},
        )
        new_dataset = self.api.create_dataset(template)
        assert new_dataset.filters == template.filters, "Created dataset filters are not as expected"
        assert new_dataset.id is not None, "We should have a dataset id"

        # the new dataset must be listed among the ones we own
        assert new_dataset.id in self.api.list_datasets(), "We should own the new dataset"

        # ... and disappear from the listing once deleted
        new_dataset.delete()
        assert new_dataset.id not in self.api.list_datasets(), "The new dataset should be deleted"
# Download the historical compressed flat files (with all entities or just companies) # they are decompressed and combined into a single csv file per year import os import zipfile from ravenpackapi import RPApi from ravenpackapi.util import parse_csv_line api_key = os.environ['RP_API_KEY'] # set your API KEY here api = RPApi(api_key) flat_type = 'companies' # can be 'companies' or 'full' flat_list = api.get_flatfile_list(flat_type) for flat_file in flat_list: file_id = flat_file['id'] combined_year_filename = '%s.combined.csv' % file_id if not os.path.isfile(combined_year_filename): with open(combined_year_filename, 'wb') as output: headers_written = False with api.get_flatfile(flat_type, file_id) as flatzip: if not os.path.isfile(file_id): print("Downloading", file_id, flat_file['size'] / 1024 / 1024, "MB") with open(file_id, 'wb') as f: for chunk in flatzip.iter_content(chunk_size=8192): f.write(chunk) with zipfile.ZipFile(file_id) as zipped: for fileinfo in zipped.namelist(): print(fileinfo) with zipped.open(fileinfo) as csv: header_line = next(csv)
from ravenpackapi import RPApi
import logging

logging.basicConfig(level=logging.DEBUG)

# initialize the API (here we use the RP_API_KEY in os.environ)
api = RPApi()

# --- query the json endpoint of a saved dataset ---
# use the public dataset with id 'us30'
ds = api.get_dataset(dataset_id='us30')
print(ds)

# query the dataset analytics with the json endpoint
data = ds.json(
    start_date='2018-01-05 18:00:00',
    end_date='2018-01-05 18:01:00',
)
for record in data:
    print(record)

# --- query the ad-hoc json endpoint with the same fields and filters ---
adhoc_data = api.json(
    start_date='2018-01-05 18:00:00',
    end_date='2018-01-05 18:01:00',
    fields=ds.fields,
    filters=ds.filters,
)
print(adhoc_data)
for record in adhoc_data:
    print(record)
class TestConditions(object):
    """Datasets with custom fields and conditions, plus dataset copying."""
    api = RPApi()
    ds = None

    @classmethod
    def setup_class(cls):
        # daily dataset computing AVG_REL (daily average RELEVANCE) and
        # keeping only ROLLUP rows where that average exceeds 30
        cls.ds = cls.api.create_dataset(
            Dataset.from_dict({
                "name": "Test custom dataset",
                "fields": ["timestamp_utc", "rp_entity_id", "entity_name", "AVG_REL"],
                "filters": {
                    "relevance": {
                        "$gte": 90
                    }
                },
                "custom_fields": [{
                    "AVG_REL": {
                        "avg": {
                            "field": "RELEVANCE",
                            "mode": "daily"
                        }
                    }
                }],
                "conditions": {
                    "$and": [{
                        "AVG_REL": {
                            "$gt": 30
                        }
                    }, {
                        "rp_entity_id": {
                            "$in": ["ROLLUP"]
                        }
                    }]
                },
                "frequency": "daily",
                "tags": []
            }))

    def test_dataset_copy_updated(self):
        """A public dataset's filters can seed a new custom daily dataset."""
        source_dataset = Dataset(api=self.api, id='us30')
        new_dataset = Dataset(
            api=self.api,
            name="copy of the us30 dataset",
            filters=source_dataset.filters,
            fields=['timestamp_utc', 'rp_entity_id', 'avg_sentiment'],
            custom_fields=[{
                "avg_sentiment": {
                    "avg": {
                        "field": "EVENT_SENTIMENT_SCORE",
                    }
                }
            }],
            frequency='daily',
            tags=['copy', 'test'])
        new_dataset.save()
        new_dataset.delete()

    @classmethod
    def teardown_class(cls):
        cls.ds.delete()
import pickle
import numpy as np
import pandas as pd
import torch
from ravenpackapi import RPApi

# we temporarily cannot share the api key
api = RPApi(api_key='')


def read_data(args):
    """Load the pickled model inputs for the requested region/sector.

    Reads, from ``args.path + args.region + '/' + args.sector + '/'``:
    input_ef.pkl, input_et.pkl, input_pt.pkl, companies_list.pkl and the
    label file matching ``args.mode`` ('price_spike' or 'volume_spike').
    NOTE(review): this excerpt is truncated - the use of ``batch`` and the
    return value are not visible here.
    """
    path = args.path + args.region + '/' + args.sector + '/'
    batch = args.batch_size
    with open(path + "input_ef.pkl", "rb") as f:
        input_ef = np.array(pickle.load(f))
    # pick the label set matching the prediction task
    if args.mode == 'price_spike':
        with open(path + "input_y_price.pkl", "rb") as f:
            label_y = np.array(pickle.load(f))
    elif args.mode == 'volume_spike':
        with open(path + "input_y_volume.pkl", "rb") as f:
            label_y = np.array(pickle.load(f))
    else:
        raise ValueError
    with open(path + "companies_list.pkl", 'rb') as f:
        company_list = pickle.load(f)
    with open(path + "input_et.pkl", "rb") as f:
        input_et = np.array(pickle.load(f))
    with open(path + "input_pt.pkl", "rb") as f:
        input_pt = np.array(pickle.load(f))
class TestConditions(object):
    """Custom fields and conditions on an RPA 1.0 dataset."""
    api = RPApi()
    ds = None

    @classmethod
    def setup_class(cls):
        # daily dataset: AVG_REL is the daily average RELEVANCE; the
        # conditions keep only ROLLUP rows whose average is above 30
        cls.ds = cls.api.create_dataset(Dataset.from_dict(
            {
                "name": "Test custom fields",
                "product": "rpa",
                "product_version": "1.0",
                "fields": [
                    "timestamp_utc",
                    "rp_entity_id",
                    "entity_name",
                    "AVG_REL"
                ],
                "filters": {},
                "custom_fields": [
                    {
                        "AVG_REL": {
                            "avg": {
                                "field": "RELEVANCE",
                                "mode": "daily"
                            }
                        }
                    }
                ],
                "conditions": {
                    "$and": [
                        {
                            "AVG_REL": {
                                "$gt": 30
                            }
                        },
                        {
                            "rp_entity_id": {
                                "$in": [
                                    "ROLLUP"
                                ]
                            }
                        }
                    ]
                },
                "frequency": "daily",
                "tags": []
            }
        ))

    def test_custom_fields_and_conditions(self):
        """One day of data yields a single ROLLUP row with avg_rel > 30."""
        self.api.log_curl_commands = True
        ds = self.ds
        assert ds.frequency == 'daily'
        dataset_id = ds.id
        assert dataset_id is not None, "Dataset should be saved"
        data = ds.json('2019-05-01', '2019-05-02')
        assert len(data) == 1
        record = next(iter(data))
        assert record['rp_entity_id'] == "ROLLUP"
        assert record['avg_rel'] > 30

    @classmethod
    def teardown_class(cls):
        cls.ds.delete()
import logging
import random  # fixed: used by wait_between_attempts but was never imported

from pprint import pprint
from microprediction import MicroWriter
from microprediction.config_private import COVID_API, COVID_UUID, TRAFFIC_WRITE_KEY

# New video tutorials are available at https://www.microprediction.com/python-1 to help you
# get started creating streams (see the 4th module in particular)

# This might be broken
from ravenpackapi import RPApi, ApiConnectionError

logging.basicConfig(level=logging.DEBUG)  # fixed: 'logging' was never imported
logger = logging.getLogger(__name__)

# initialize the API (here we use the RP_API_KEY in os.environ)
api = RPApi(api_key=COVID_API)

# query the realtime feed
ds = api.get_dataset(dataset_id=COVID_UUID)


def wait_between_attempts():
    """Incremental backoff between connection attempts.

    Yields wait times in seconds, growing 1.5x per attempt (capped near 30s)
    with a +0..50% random jitter to avoid synchronized reconnect storms.
    """
    wait_time = 19.3  # time is in seconds
    while True:
        yield wait_time
        wait_time = min(wait_time * 1.5, 30)
        wait_time *= (100 + random.randint(0, 50)) / 100


# generator instance consumed by the reconnect loop (next(wait_time))
wait_time = wait_between_attempts()
from ravenpackapi import RPApi
from ravenpackapi import Dataset
import pandas as pd

apikey = "**********************"  # your RavenPack API key
api = RPApi(api_key=apikey)


# Extracts data (positive news count) from one entity
def get_counts(entity_id, ltgt, start_date, end_date, filename):
    """Build a daily dataset counting news events for a single entity.

    ltgt selects the sentiment direction: 'lt' counts negative events,
    anything else counts positive ones. NOTE(review): this excerpt is
    truncated mid-definition of the dataset filters.
    """
    label = "count_pos"
    if ltgt == "lt":
        label = "count_neg"
    # NOTE(review): 'global' is unnecessary here - 'api' is only read
    global api
    # NOTE(review): the filter keys below ('and', 'gte') lack the '$' prefix
    # ('$and', '$gte') used by the RavenPack query syntax elsewhere - verify
    custom_dataset = Dataset(name="Test set",
                             frequency="daily",
                             filters={
                                 "and": [
                                     {
                                         "rp_entity_id": entity_id
                                     },
                                     {
                                         "event_relevance": {
                                             "gte": 90
                                         }
                                     },
                                     {
                                         "event_sentiment_score": {
    print('JSON - OK')
    results['json'] = True


def check_realtime():
    """Read a single realtime record to prove the streaming endpoint works."""
    print("Realtime ...")
    for record in ds.request_realtime():
        assert isinstance(record, Result)
        break  # one valid record is enough
    print('Realtime - OK')
    results['realtime'] = True


if __name__ == '__main__':
    args = parser.parse_args()
    api = RPApi(args.key)
    ds = api.get_dataset('all-granular-data')
    # query a 3-minute window ending one hour ago, so data surely exists
    date_end = datetime.datetime.utcnow() - datetime.timedelta(minutes=60)
    date_start = date_end - datetime.timedelta(minutes=3)  # 3 minutes of data
    if not api.api_key:
        print("Please provide an APIKEY: with the --key parameter or setting the RP_API_KEY environment variable")
        exit(1)
    print("Checking connection with APIKEY: %s" % api.api_key)
    # run the three endpoint checks in parallel threads
    checks = [
        threading.Thread(target=check_datafile),
        threading.Thread(target=check_json),
        threading.Thread(target=check_realtime),
    ]
""" Download all data from the chosen dataset in a time range Download files are compressed, and chunked per year """ import os from ravenpackapi import RPApi from ravenpackapi.util import time_intervals, SPLIT_WEEKLY api = RPApi(api_key='YOUR_API_KEY') ds = api.get_dataset('YOUR_DATASET_ID') start_date = '2018-01-01' end_date = '2018-01-10' GET_COMPRESSED = True output_folder = './output' os.makedirs(output_folder, exist_ok=True) # create folder for output for range_start, range_end in time_intervals( start_date, end_date, split=SPLIT_WEEKLY, # available splits: # SPLIT_YEARLY, SPLIT_WEEKLY, SPLIT_DAILY # or SPLIT_MONTHLY (the default) ): job = ds.request_datafile(