def run(self):
    """Wire two sample links onto a ``SchedulePlanner`` and start it.

    * Every 0.5 seconds the file inlet reading ``output_03.csv`` is
      transferred to a print outlet.
    * Every 2 seconds the todo HTTP inlet (listed three times in the link)
      is transferred to a CSV outlet targeting ``output_03.csv``.
    """
    # An APS-based planner was previously tried here instead.
    planner = SchedulePlanner(refresh_interval=0.5)

    # --- inlets ---------------------------------------------------------
    todo_inlet = HttpInlet(
        'https://jsonplaceholder.typicode.com/todos/1',
        metadata={CsvOutlet.FILE_MODE: 'a'},
    )
    csv_lines_inlet = FileInlet('output_03.csv', read_mode=FileInletMode.LINE)
    # Not attached to any active link at the moment.
    echo_inlet = HttpInlet(
        'https://postman-echo.com/get?foo1=bar1&foo2=bar2',
        metadata={
            'MONGODB_COLLECTION': 'test_collection2',
            'csv_file': 'output_02.csv',
        },
    )

    # --- outlets --------------------------------------------------------
    printer = PrintOutlet(only_payload=True)
    # Not attached to any active link at the moment.
    mongo_outlet = MongoOutlet('databay', 'test_collection')
    csv_writer = CsvOutlet('output_03.csv')

    # --- links ----------------------------------------------------------
    file_to_print = Link([csv_lines_inlet], [printer], timedelta(seconds=0.5))
    planner.add_links(file_to_print)

    todos_to_csv = Link(
        [todo_inlet, todo_inlet, todo_inlet],
        [csv_writer],
        timedelta(seconds=2),
    )
    planner.add_links(todos_to_csv)

    planner.start()
def _unschedule(self, link: Link):
    """
    Remove the job attached to a link and clear it on the link.

    A link without a job is left untouched.

    :type link: :any:`Link`
    :param link: Link to be unscheduled
    """
    current_job = link.job
    if current_job is None:
        return

    current_job.remove()
    link.set_job(None)
def _schedule(self, link: Link):
    """
    Register the link's transfer with the scheduler on an interval trigger.

    The resulting :any:`APS Job <apscheduler.job.Job>` is stored as this
    link's job.

    :type link: :any:`Link`
    :param link: Link to be scheduled
    """
    interval_seconds = link.interval.total_seconds()
    trigger = IntervalTrigger(seconds=interval_seconds)
    scheduled_job = self._scheduler.add_job(link.transfer, trigger=trigger)
    link.set_job(scheduled_job)
def test_flush(self, inlet, outlet):
    """Records are held back until ``flush`` is set, then all released."""
    buffer = Buffer(count_threshold=100, time_threshold=10)
    records = [Record(payload=value) for value in (1, 2, 3, 4)]
    link = Link(inlet, outlet, interval=1, processors=buffer,
                copy_records=False)

    # Neither threshold is reached by these batches: no records yet.
    for batch in (records[:2], records[2:]):
        inlet._pull = pull_mock(batch)
        link.transfer()
        outlet._push.assert_called_with([], mock.ANY)

    # Forcing a flush releases everything, even when nothing new is pulled.
    buffer.flush = True
    inlet._pull = pull_mock([])
    link.transfer()
    outlet._push.assert_called_with(records, mock.ANY)
def test_buffer_time(self, inlet, outlet):
    """Records are released once the buffer's time threshold has elapsed."""
    buffer = Buffer(time_threshold=0.02)
    records = [Record(payload=value) for value in (1, 2, 3, 4)]
    link = Link(inlet, outlet, interval=1, processors=buffer,
                copy_records=False)

    # Not enough time has passed for either of these batches.
    for batch in (records[:2], records[2:]):
        inlet._pull = pull_mock(batch)
        link.transfer()
        outlet._push.assert_called_with([], mock.ANY)

    # Wait out the threshold; all records should be returned here.
    time.sleep(0.02)
    inlet._pull = pull_mock([])
    link.transfer()
    outlet._push.assert_called_with(records, mock.ANY)
def _schedule(self, link: Link):
    """
    Schedule a link and store the resulting :class:`schedule.Job` on it.

    :type link: :any:`Link`
    :param link: Link to be scheduled

    :raises: :class:`ScheduleIntervalError` if link's interval is smaller
        than the :any:`refresh interval`.
    """
    interval_seconds = link.interval.total_seconds()

    # Reject links whose interval is below the planner's refresh interval.
    if interval_seconds < self._refresh_interval:
        raise ScheduleIntervalError(
            f'Link interval must be greater than or equal to refresh interval. Link interval: {link.interval.total_seconds()}s, Refresh interval: {self._refresh_interval}s'
        )

    recurring = schedule.every(interval_seconds).seconds
    link.set_job(recurring.do(self._run_job, link))
def test_flush_after_shutdown(self, inlet, outlet):
    """Buffered records survive planner shutdown and are only pushed on flush.

    The planner runs briefly in a background thread, is shut down, and the
    test then checks that every push so far carried an empty record list,
    that the buffer holds every pulled record, and that a forced transfer
    with ``flush`` set pushes all of them.
    """
    buffer = Buffer(count_threshold=100, time_threshold=10)
    counter_dict = {'counter': 0, 'records': []}
    link = Link(inlet, outlet, interval=0.01, processors=buffer,
                copy_records=False)
    planner = SchedulePlanner(link, refresh_interval=0.01)

    async def pull_coro(_):
        # Produce one uniquely numbered record per pull and remember it.
        counter_dict['counter'] += 1
        record = Record(payload=counter_dict['counter'])
        counter_dict['records'].append(record)
        return [record]

    mock_pull = MagicMock(side_effect=pull_coro)
    inlet._pull = mock_pull

    th = Thread(target=planner.start, daemon=True)
    th.start()
    time.sleep(0.1)
    planner.shutdown()
    th.join()

    # NOTE: a recorded call must be indexed ([0] = positional args), not
    # called. Calling it builds a fresh empty call that compares equal to
    # [], which would make these assertions pass unconditionally.
    calls = outlet._push.call_args_list
    for c in calls:
        self.assertEqual(c[0][0], [],
                         'Should only contain empty record lists.')

    self.assertEqual(buffer.records, counter_dict['records'],
                     'All records should be stored in the buffer')

    planner.force_transfer()
    self.assertEqual(outlet._push.call_args[0][0], [],
                     'Should return empty record list')

    buffer.flush = True
    planner.force_transfer()
    self.assertEqual(outlet._push.call_args[0][0], counter_dict['records'],
                     'Should return all records')
def test_buffer_count(self, inlet, outlet):
    """Records are released once the buffer's count threshold is reached."""
    buffer = Buffer(count_threshold=3)
    records = [Record(payload=value) for value in (1, 2, 3, 4)]
    first_batch, second_batch = records[:2], records[2:]
    link = Link(inlet, outlet, interval=1, processors=buffer,
                copy_records=False)

    # Two records are below the threshold: nothing is pushed yet.
    inlet._pull = pull_mock(first_batch)
    link.transfer()
    outlet._push.assert_called_with([], mock.ANY)

    # Four records exceed the threshold: everything is pushed at once.
    inlet._pull = pull_mock(second_batch)
    link.transfer()
    outlet._push.assert_called_with(records, mock.ANY)
class DummyTextInlet(Inlet):
    """A simple `Inlet` that randomly pulls a string from a list of strings."""

    def __init__(self, text: list, *args, **kwargs):
        """Store the candidate strings and start the id counter at zero."""
        super().__init__(*args, **kwargs)
        self.text = text
        self._id = 0

    def pull(self, update):
        """Return a one-item dict mapping a fresh id to a random string."""
        chosen = random.choice(self.text)
        self._id += 1
        # Pause for a second before handing the result back.
        time.sleep(1)
        return {self._id: chosen}


_LOGGER.setLevel(logging.INFO)

# Elasticsearch client constructed with a 30 second timeout.
es_client = elasticsearch.Elasticsearch(timeout=30)

# Inlet choosing among the '.'-separated pieces of TEXT.
text_inlet = DummyTextInlet(TEXT.split("."))
elasticsearch_outlet = ElasticsearchIndexerOutlet(es_client, "my-test-index")

link = Link(
    text_inlet,
    elasticsearch_outlet,
    interval=2,
    tags='elasticsearch_outlet',
)

planner = ApsPlanner(link)
planner.start()
class PrintOutlet(Outlet):
    """Outlet that logs every record's payload from its own coroutine."""

    async def push(self, records: [Record], update):
        """Run one print task per record and wait for all of them."""
        _LOGGER.debug(f'{update} push starts')

        # One coroutine per record, all awaited together.
        await asyncio.gather(
            *(self.print_task(record, update) for record in records)
        )

    async def print_task(self, record, update):
        """Log a single record's payload after a short asynchronous delay."""
        # simulate a long-taking operation
        await asyncio.sleep(0.5)

        _LOGGER.debug(f'{update} consumed:{record.payload}')


random_int_inletA = RandomIntInlet()
random_int_inletB = RandomIntInlet()
random_int_inletC = RandomIntInlet()

print_outlet = PrintOutlet()

# All three inlets feed the same outlet every two seconds.
link = Link(
    [random_int_inletA, random_int_inletB, random_int_inletC],
    print_outlet,
    interval=timedelta(seconds=2),
    name='async',
)

planner = SchedulePlanner(link)
planner.start()
import datetime
import logging

from databay import Link
from databay.inlets import HttpInlet
from databay.outlets import MongoOutlet
from databay.planners import APSPlanner

# Show databay's debug output while the example runs.
databay_logger = logging.getLogger('databay')
databay_logger.setLevel(logging.DEBUG)

# Source: a single HTTP endpoint.
http_inlet = HttpInlet('https://jsonplaceholder.typicode.com/todos/1')

# Destination: a MongoDB collection.
mongo_outlet = MongoOutlet(
    database_name='databay',
    collection='test_collection',
)

# Transfer from the inlet to the outlet every five seconds.
transfer_interval = datetime.timedelta(seconds=5)
link = Link(http_inlet, mongo_outlet, transfer_interval, name='http_to_mongo')

# Hand the link to a planner and start scheduling.
planner = APSPlanner(link)
planner.start()
import urllib.parse
import urllib.request


class WeatherInlet(Inlet):
    """Inlet that pulls a weather description for a city from OpenWeatherMap."""

    def __init__(self, api_key: str, city_name: str, *args, **kwargs):
        """
        :param api_key: OpenWeatherMap API key sent as the ``appid`` parameter.
        :param city_name: City name sent as the ``q`` parameter.
        """
        super().__init__(*args, **kwargs)
        self.api_key = api_key
        self.city_name = city_name

    def pull(self, update) -> List[Record]:
        """
        Query the weather endpoint and return the first description found.

        NOTE(review): the value returned is the plain description string, not
        a list of records as annotated - presumably the framework wraps it;
        confirm against ``Inlet``.

        :param update: Update object passed in by the link (unused here).
        :returns: ``weather[0].description`` from the JSON response.
        """
        # Percent-encode the parameters so names containing spaces or other
        # reserved characters still form a valid URL.
        query = urllib.parse.urlencode(
            {'q': self.city_name, 'appid': self.api_key},
            quote_via=urllib.parse.quote,
        )
        url = f'https://api.openweathermap.org/data/2.5/weather?{query}'

        # Close the HTTP response deterministically once it has been read.
        with urllib.request.urlopen(url) as response:
            contents = response.read().decode('utf8')

        formatted = json.loads(contents)
        return formatted['weather'][0]['description']


api_key = os.environ.get('OPEN_WEATHER_MAP_API_KEY')
weather_inlet = WeatherInlet(api_key, 'Bangkok')

link = Link(weather_inlet,
            PrintOutlet(only_payload=True),
            interval=timedelta(seconds=2),
            name='bangkok_weather')

planner = APSPlanner(link)
planner.start()
super().__init__() self.default_filepath = default_filepath self.default_file_mode = default_file_mode def push(self, records: [Record], update): for record in records: filepath = record.metadata.get(self.FILEPATH, self.default_filepath) file_mode = record.metadata.get(self.FILE_MODE, self.default_file_mode) with open(filepath, file_mode) as f: f.write(str(record.payload) + '\n') metadata = { FileOutlet.FILEPATH: 'outputs/random_ints.txt', FileOutlet.FILE_MODE: 'a' } random_int_inlet = RandomIntInlet(metadata=metadata) file_outlet = FileOutlet() link = Link(random_int_inlet, file_outlet, interval=timedelta(seconds=2), name='file_outlet') planner = APSPlanner(link) planner.start()
from datetime import timedelta

from databay import Link
from databay.inlets import RandomIntInlet
from databay.outlet import Outlet
from databay.planners import SchedulePlanner
from databay.record import Record


class ConditionalPrintOutlet(Outlet):
    """Outlet that prints only the records whose metadata asks for it."""

    SHOULD_PRINT = 'ConditionalPrintOutlet.SHOULD_PRINT'
    """Whether records should be printed or skipped."""

    def push(self, records: [Record], update):
        """Print each record whose ``SHOULD_PRINT`` metadata value is truthy.

        :param records: Records handed over by the link.
        :param update: Update object, printed alongside each record.
        """
        for record in records:
            if record.metadata.get(self.SHOULD_PRINT):
                print(update, record)


# One inlet flags its records for printing, the other flags them to be skipped.
random_int_inlet_on = RandomIntInlet(
    metadata={ConditionalPrintOutlet.SHOULD_PRINT: True})
random_int_inlet_off = RandomIntInlet(
    metadata={ConditionalPrintOutlet.SHOULD_PRINT: False})

print_outlet = ConditionalPrintOutlet()

link = Link([random_int_inlet_on, random_int_inlet_off],
            print_outlet,
            interval=timedelta(seconds=0.5),
            name='should_print_metadata')

planner = SchedulePlanner(link, refresh_interval=0.5)
planner.start()
import datetime

from databay.inlets import HttpInlet
from databay.outlets import PrintOutlet
from databay import Link
from databay.planners import ApsPlanner
from databay.outlets.file_outlet import FileOutlet


# filter
class BitcoinInlet(HttpInlet):
    """HttpInlet that reduces the ticker response to a single value."""

    async def pull(self, update):
        """Return the ``last`` value found under the ``USD`` key of the response.

        :param update: Update object forwarded to ``HttpInlet.pull``.
        """
        response = await super().pull(update)
        return response.get('USD').get('last')


# produce
# stock_inlet = HttpInlet('https://blockchain.info/ticker')
stock_inlet = BitcoinInlet('https://blockchain.info/ticker')

# consume
print_outlet = PrintOutlet(True, True)
file_outlet = FileOutlet('bitcoin_price_1s.txt')

# transfer
link = Link(stock_inlet, [print_outlet, file_outlet],
            interval=datetime.timedelta(seconds=1))

planner = ApsPlanner(link)
planner.start()
import random
from datetime import timedelta

from databay import Inlet
from databay import Link
from databay.outlets import PrintOutlet
from databay.planners import SchedulePlanner


class RandomIntInlet(Inlet):
    """Inlet producing one random integer per pull."""

    def pull(self, update):
        """Return a random integer between 0 and 100, both ends included."""
        lowest, highest = 0, 100
        return random.randint(lowest, highest)


# Source and destination.
random_int_inlet = RandomIntInlet()
print_outlet = PrintOutlet(only_payload=True)

# Transfer a random integer to the print outlet every five seconds.
link = Link(
    random_int_inlet,
    print_outlet,
    interval=timedelta(seconds=5),
    name='random_ints',
)

planner = SchedulePlanner(link)
planner.start()
from datetime import timedelta

from databay import Link
from databay.inlets import RandomIntInlet
from databay.outlet import Outlet
from databay.planners import SchedulePlanner
from databay.record import Record


class PrintOutlet(Outlet):
    """Outlet that prints the payload of every record it receives."""

    def push(self, records: [Record], update):
        """Print ``update`` followed by each record's payload, one per line."""
        for payload in (record.payload for record in records):
            print(update, payload)


# Source and destination.
random_int_inlet = RandomIntInlet()
print_outlet = PrintOutlet()

# Transfer from the inlet to the outlet every two seconds.
link = Link(
    random_int_inlet,
    print_outlet,
    interval=timedelta(seconds=2),
    tags='print_outlet',
)

planner = SchedulePlanner(link)
planner.start()