async def bulk_index(self, docs, namespace, params=None, chunk_size=None, doc_process=None):
    """
    Insert multiple documents into Elasticsearch directly.

    :return: a (succeed_total, failed_total) tuple, or None if docs is empty.
    """
    if not docs:
        return None
    if doc_process:
        docs = stream.map(docs, doc_process)
    docs = stream.map(docs, self._formatter.format_document)

    async def bulk(docs):
        succeed_total, failed_total = 0, 0
        async for (succeed, failed) in self._chunk(actions=docs,
                                                   chunk_size=self.chunk_size,
                                                   params=params):
            succeed_total += len(succeed)
            failed_total += len(failed)
            self.monitor.increase_succeed(len(succeed))
            self.monitor.increase_failed(len(failed))
            logger.info('[Direct bulk] ns:%s succeed:%d' % (namespace, len(succeed)))
            if failed:
                logger.warning('[Direct bulk] ns:%s failed:%d' % (namespace, len(failed)))
                _, failed = await asyncio.ensure_future(self._failed_actions_commit(failed))
                if not failed:
                    logger.debug('Failed actions commit success')
                else:
                    logger.warning('Failed actions commit failed')
        return succeed_total, failed_total

    return await bulk(stream.map(
        docs,
        lambda doc: self._gen_action(ElasticOperate.index, namespace, util.utc_now(), doc, False)[0],
    ))
async def multi_resolver(collection: AsyncIOMotorCollection, watcher, pipeline=[]):
    # Initial snapshot: aggregate the first few matching documents, keyed by _id.
    documents: list = await find(
        collection,
        match=get_where(pipeline),
    )
    documents = documents[:3]
    documents: dict = {
        doc['_id']: aggregate([doc], pipeline)[0]
        for doc in documents
    }
    yield list(documents.values())

    def process(change):
        document = change['fullDocument']
        _id = document['_id']
        if change['operationType'] == 'insert':
            documents.update({_id: aggregate([document], pipeline)[0]})
            return list(documents.values())
        elif change['operationType'] == 'update':
            if _id in documents:
                documents[_id] = aggregate([document], pipeline)[0]
                return list(documents.values())

    # Re-emit the full result set on every relevant change, batched per window.
    xs = stream.map(watcher, process, task_limit=1)
    xs = stream.filter(xs, bool)
    xs = window(xs, BATCH_INTERVAL)
    xs = last_per_window(xs)
    async for x in xs:
        yield list(x)
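# `window` is referenced above (and in the resolvers below) but is not defined in
# these snippets. A minimal sketch of one plausible implementation, assuming a
# time-based tumbling window that emits whatever arrived during each `interval`
# as a list (empty windows included, since last_per_window filters falsy values
# downstream). This is an illustration, not the original helper.
import asyncio
from aiostream import streamcontext

async def window(source, interval):
    queue = asyncio.Queue()
    done = object()  # sentinel marking exhaustion of the source

    async def pump():
        # Drain the source into the queue from a background task so that the
        # timeout below never cancels the source's own __anext__.
        async with streamcontext(source) as streamer:
            async for item in streamer:
                await queue.put(item)
        await queue.put(done)

    pump_task = asyncio.ensure_future(pump())
    try:
        finished = False
        while not finished:
            batch = []
            deadline = asyncio.get_running_loop().time() + interval
            while True:
                remaining = deadline - asyncio.get_running_loop().time()
                if remaining <= 0:
                    break
                try:
                    item = await asyncio.wait_for(queue.get(), remaining)
                except asyncio.TimeoutError:
                    break
                if item is done:
                    finished = True
                    break
                batch.append(item)
            yield batch
    finally:
        pump_task.cancel()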
async def main():
    db = AsyncIOMotorClient().db

    async def persist(updates):
        if updates:
            await db[AGGREGATED_COLLECTION].bulk_write(updates)
        print('simulating long sleep')
        await asyncio.sleep(2)
        return 'done'

    batcher = Batcher(persist, interval=PERSIST_INTERVAL)

    def key(doc):
        return doc[ID_KEY]

    async def function(acc, document):
        # Accumulator: count events per key.
        return acc + 1

    async def initializer(doc: dict):
        # Seed the accumulator from the previously persisted value, if any.
        value = await db[AGGREGATED_COLLECTION].find_one(
            {AGGREGATED_ID_KEY: key(doc)})
        value = value and value.get(AGGREGATED_KEY)
        return value or 0

    xs = events(collection=db[EVENTS_COLLECTION])
    xs = accumulate_by_key(xs, function, key=key, initializer=initializer)
    xs = stream.starmap(xs, make_db_operation)
    xs = stream.map(xs, batcher.push)
    # Alternative: window-based batching instead of the Batcher helper.
    # xs = window(xs, PERSIST_INTERVAL)
    # xs = stream.map(xs, take_last)
    # xs = stream.map(xs, list)
    # xs = stream.map(xs, lambda x: [z[1] for z in x])
    # xs = stream.map(xs, persist, task_limit=1)
    xs = stream.map(xs, pretty)
    await asyncio.gather(
        store_some(db),
        xs,
    )
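# `make_db_operation` is not shown in the snippet above. A plausible sketch under
# the assumptions that accumulate_by_key yields (key, accumulated_value) pairs and
# that the bulk_write in persist() expects pymongo operations; it upserts the
# accumulated value into the aggregated collection. Illustrative only.
from pymongo import UpdateOne

def make_db_operation(key, value):
    return UpdateOne(
        {AGGREGATED_ID_KEY: key},
        {'$set': {AGGREGATED_KEY: value}},
        upsert=True,
    )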
async def async_fetch_urlset(urls, download_dir, pbar=None, verbose=False):
    async with httpx.AsyncClient(http2=True) as session:
        ws = stream.repeat(session)
        xs = stream.zip(ws, stream.iterate(urls))
        ys = stream.starmap(xs, fetch, ordered=False, task_limit=10)
        process_download = partial(process, download_dir=download_dir, pbar=pbar, verbose=verbose)
        zs = stream.map(ys, process_download)
        return await zs
async def async_fetch_urlset(urls, schedules, pbar=None, verbose=False, use_http2=True):
    async with httpx.AsyncClient(http2=use_http2) as session:
        ws = stream.repeat(session)
        xs = stream.zip(ws, stream.iterate(urls))
        ys = stream.starmap(xs, fetch, ordered=False, task_limit=20)  # a limit of 30 performs similarly
        process = partial(process_soup, schedules=schedules, pbar=pbar, verbose=verbose)
        zs = stream.map(ys, process)
        return await zs
async def async_fetch_episodes(listings, pbar=None, verbose=False, use_http2=False):
    jsons = dict(zip(listings.broadcasts_urlset, listings.all_broadcasts))
    limits = httpx.Limits(max_keepalive_connections=20)
    async with httpx.AsyncClient(http2=use_http2, limits=limits) as session:
        ws = stream.repeat(session)
        xs = stream.zip(ws, stream.iterate(listings.broadcasts_urlset))
        ys = stream.starmap(xs, fetch, ordered=False, task_limit=20)  # 20 is optimal
        process = partial(process_json, jsons=jsons, pbar=pbar, verbose=verbose)
        zs = stream.map(ys, process)
        return await zs
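# The `fetch` coroutine shared by the three httpx-based fetchers above is not
# included in these snippets. A minimal sketch, assuming it just issues a GET on
# the (session, url) pairs produced by stream.zip and passes the response along
# (the URL stays recoverable via response.url). Illustrative only.
async def fetch(session, url):
    response = await session.get(url)
    response.raise_for_status()
    return response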
async def single_resolver(collection, watcher, pipeline=[]):
    initializer = await find_one(collection, get_where(pipeline) or {})
    yield initializer
    xs = stream.filter(
        watcher,
        lambda change: change['operationType'] == 'update',
    )
    xs = stream.map(xs, lambda change: change['fullDocument'], task_limit=1)
    xs = stream.filter(xs, lambda doc: doc['_id'] == initializer['_id'])
    xs = stream.concatmap(xs, lambda w: stream.iterate(aggregate([w], pipeline)), task_limit=1)
    xs = window(xs, BATCH_INTERVAL)
    xs = last_per_window(xs)
    async for x in xs:
        print(f'serving {prettify(x)}')
        yield x
async def async_get(self, urls):
    result = []
    # TODO: Error checking
    async with aiohttp.ClientSession() as session:

        async def fetch(url):
            if self.logger:
                self.logger.log(f"Grabbing {url}")
            for _ in range(self.retry + 1):
                try:
                    async with session.get(url) as resp:
                        if resp.status == 200:
                            return await resp.text()
                        else:
                            logging.error(
                                f"Server returned error status {resp.status} on {url}"
                            )
                            if self.logger:
                                self.logger.log(f"Error on {url}")
                            return ""
                except aiohttp.InvalidURL:
                    logging.error(f"Invalid URL: {url}")
                except aiohttp.ClientPayloadError:
                    logging.error("Invalid payload")
                except Exception as e:
                    logging.error(f"Unexpected error: {e}")
            # All retries raised exceptions.
            return ""

        url_stream = stream.iterate(urls)
        html_stream = stream.map(url_stream, fetch, ordered=True, task_limit=10)
        async with html_stream.stream() as streamer:
            async for item in streamer:
                result.append(item)
    return result
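# Illustrative usage of async_get: it is a method, so it needs its enclosing
# object (anything exposing .logger and .retry, not shown here). Because the
# map above uses ordered=True, the returned pages line up with the input URLs.
async def demo_async_get(scraper):
    urls = ["https://example.com/a", "https://example.com/b"]
    pages = await scraper.async_get(urls)
    for url, html in zip(urls, pages):
        print(url, len(html))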
def last_per_window(xs):
    xs = stream.map(xs, take_last)
    xs = stream.filter(xs, bool)
    return xs
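# `take_last` is used above but not defined in these snippets. A minimal sketch,
# assuming each element reaching it is one materialized window (a list): return
# the final item, or None for an empty window so the stream.filter(xs, bool)
# step drops it. Illustrative only.
def take_last(items):
    return items[-1] if items else None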
async def execute(seed_prs: List[int], dry: bool = False, database_url: str = None) -> None:
    session = aiohttp.ClientSession()
    pr_stream = stream.map(
        aiter_opened_prs(seed_prs, session=session),
        partial(get_ofborg_eval, session=session),
        ordered=False,
    )
    if dry:
        sqs_queues = None
        autoscaling = None
    else:
        sqs_queues = get_sqs()
        autoscaling = get_autoscaling()
        assert database_url is not None
    if database_url is not None:
        conn = await asyncpg.connect(database_url)
        await conn.execute(create_nixpkgs_review_dispatched_table_sql())
    else:
        conn = None
    log.info("Setup", sqs=sqs_queues, autoscaling=autoscaling, conn=conn)
    async with pr_stream.stream() as streamer:
        async for event, ofborg_eval in streamer:
            pr = event["payload"]["number"]
            log.info("Main loop", pr=pr)
            if ofborg_eval is None:
                log.info(
                    "Ofborg failed or no packages",
                    pr=pr,
                    ofborg_eval=ofborg_eval,
                    failed=True,
                )
                # Ofborg failed
                continue
            log.info("New buildable PR", pr=pr, ofborg_eval=ofborg_eval)
            await log_buildable_pr(conn, pr=pr, ofborg_eval=ofborg_eval)
            if sqs_queues is not None:
                for system in ALL_BUILD_SYSTEMS:
                    if len(ofborg_eval["packages_per_system"].get(system, set())) == 0:
                        log.info("Empty pull request", pr=pr, system=system)
                        continue
                    sqs_response = sqs_queues[system].send_message(
                        # Message must be shorter than 2048 bytes, so don't pack
                        # too much stuff in here
                        MessageBody=json.dumps(
                            dict(
                                pr=pr,
                                ofborg_url=ofborg_eval["url"],
                            )))
                    if sqs_response["ResponseMetadata"]["HTTPStatusCode"] != 200:
                        log.error("SQS Response", response=sqs_response, pr=pr)
            else:
                log.info(
                    "Skipping SQS submission",
                    pr=pr,
                    sqs_queues=sqs_queues,
                )
async def async_map(func, items):
    if isinstance(items, Stream):
        return stream.map(items, func)
    return stream.map(stream.iterate(items), func)
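# Illustrative usage of async_map: the returned value is an aiostream Stream, so
# it can be awaited for its last item or consumed through its .stream() context
# manager. The `double` callback below is a stand-in, not from the original code.
async def demo_async_map():
    async def double(x):
        return x * 2

    doubled = await async_map(double, [1, 2, 3])
    async with doubled.stream() as streamer:
        async for item in streamer:
            print(item)  # 2, 4, 6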