def get_file_list(account, **kwargs):
    queue = Queue()
    sem = BoundedSemaphore(FETCH_CONCURRENCY)
    done, working = set(), set()
    data = set()

    @gen.coroutine
    def fetch_url():
        current_url = yield queue.get()
        try:
            if current_url in working:
                return
            page_no = len(working)
            app_log.info("Fetching page {}".format(page_no))
            working.add(current_url)
            req = account.get_request(current_url)
            client = AsyncHTTPClient()
            response = yield client.fetch(req)
            done.add(current_url)
            app_log.info("Page {} downloaded".format(page_no))
            response_data = json.loads(response.body.decode('utf-8'))
            for file in response_data:
                # be sure we're a valid file type and less than our maximum response size limit
                extension = file['path'].lower().split('.')[-1]
                if extension in VALID_FILETYPES and int(file['bytes']) < RESPONSE_SIZE_LIMIT * 1000000:
                    data.add((file['path'].lstrip('/'), file['path']))
            app_log.info("Page {} completed".format(page_no))
        finally:
            queue.task_done()
            sem.release()

    @gen.coroutine
    def worker():
        while True:
            yield sem.acquire()
            fetch_url()

    app_log.info("Gathering filelist for account {}".format(account._id))
    for file_type in VALID_FILETYPES:
        url = "https://api.dropbox.com/1/search/auto/?query={}&include_membership=true".format(file_type)
        queue.put(url)

    # start our concurrency worker
    worker()

    # wait until we're done
    yield queue.join(timeout=timedelta(seconds=MAXIMUM_REQ_TIME))
    app_log.info("Finished list retrieval. Found {} items.".format(len(data)))
    return sorted([{"title": title, "value": path} for title, path in data],
                  key=lambda f: f['title'])
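# Every get_file_list variant in this collection repeats one pattern: producers
# put URLs on a tornado.queues.Queue, workers gated by a BoundedSemaphore drain
# it, and the caller blocks on queue.join() with a timeout. A minimal,
# self-contained sketch of just that pattern follows; the constants and the
# stand-in "work" are placeholders, not code from the example above.
from datetime import timedelta

from tornado import gen, ioloop
from tornado.locks import BoundedSemaphore
from tornado.queues import Queue

CONCURRENCY = 4

@gen.coroutine
def queue_pattern_demo():
    queue = Queue()
    sem = BoundedSemaphore(CONCURRENCY)
    results = []

    @gen.coroutine
    def handle_item():
        item = yield queue.get()
        try:
            results.append(item * 2)  # stand-in for the real fetch/parse work
        finally:
            queue.task_done()
            sem.release()

    @gen.coroutine
    def worker():
        while True:
            yield sem.acquire()
            handle_item()  # fire and forget; the semaphore caps concurrency

    for i in range(10):
        queue.put(i)
    worker()
    yield queue.join(timeout=timedelta(seconds=30))
    raise gen.Return(results)

if __name__ == '__main__':
    print(ioloop.IOLoop.current().run_sync(queue_pattern_demo))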
def run(args):
    if not args.test:
        ip_iter = _create_ip_iterator()
    else:
        ip_iter = _get_test_ips()

    good_ips = []
    job_queue = Queue(maxsize=200)
    start = time.time()
    counter = Counter()

    @gen.coroutine
    def job_producer():
        for ip in ip_iter:
            yield job_queue.put(ip)
            # print("Put {}".format(ip))

    @gen.coroutine
    def worker(id):
        while True:
            ip = yield job_queue.get()
            try:
                good = yield test_ip(ip)
                counter['all'] += 1
                if args.progress and counter['all'] % 10000 == 0:
                    print("Tested {} ips.".format(counter['all']))
                if good:
                    print("Found good ip: {}".format(ip))
                    counter['good'] += 1
                    if not args.test:
                        yield record_good_ip(ip)
                    else:
                        good_ips.append(ip)
            finally:
                job_queue.task_done()

    for i in range(CONCURRENCY):
        worker(i)

    _disable_logging()
    try:
        yield job_producer()
        yield job_queue.join()
    finally:
        print("\n\nTested: {} ips\nFound {} good ips\nQps: {}".format(
            counter['all'], counter['good'],
            counter['all'] / (time.time() - start)))

    if args.test and args.remove:
        with open(GOOD_IP_FILE + '_removed', 'w') as f:
            f.write('|'.join(good_ips))
class TornadoQuerierBase(object):

    def __init__(self):
        self.tasks = TornadoQueue()

    def gen_task(self):
        raise NotImplementedError()

    def run_task(self, task):
        raise NotImplementedError()

    def prepare(self):
        self.running = True

    def cleanup(self):
        self.running = False

    @coroutine
    def run_worker(self, worker_id, f):
        while self.tasks.qsize() > 0:
            task = yield self.tasks.get()
            LOG.debug('worker[%d]: current task is %s' % (worker_id, task))
            try:
                yield f(task)
            except Exception as e:
                LOG.warning(str(e))
            finally:
                self.tasks.task_done()
                task = None
        LOG.debug('worker[%d]: all tasks done %s' % (worker_id, self.tasks))

    @coroutine
    def start(self, num_workers=1):
        self.prepare()

        # add tasks
        tasks = yield self.gen_task()
        for task in tasks:
            yield self.tasks.put(task)

        # start worker coroutines
        for worker_id in range(num_workers):
            LOG.debug('starting worker %d' % worker_id)
            self.run_worker(worker_id, self.run_task)

        yield self.tasks.join()
        self.cleanup()
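# A minimal, hypothetical subclass showing how TornadoQuerierBase is meant to
# be driven; DemoQuerier, its task strings, and the run_sync call are
# assumptions for illustration, not part of the original code.
from tornado.gen import Return, coroutine
from tornado.ioloop import IOLoop

class DemoQuerier(TornadoQuerierBase):

    @coroutine
    def gen_task(self):
        # start() does `tasks = yield self.gen_task()`, so hand the task
        # list back through gen.Return
        raise Return(['task-%d' % i for i in range(10)])

    @coroutine
    def run_task(self, task):
        LOG.debug('running %s', task)

# IOLoop.current().run_sync(lambda: DemoQuerier().start(num_workers=4))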
def main():
    concurrency = 10
    queue = Queue()
    queue.put("http://www.jianshu.com")
    workers = []
    for _ in range(concurrency):
        workers.append(Worker(app, queue))
    for worker in workers:
        Log4Spider.debugLog("worker begin:", worker)
        worker.run()
    Log4Spider.debugLog("waiting for spiderQueue empty:")
    yield queue.join(timeout=timedelta(seconds=300))
    Log4Spider.debugLog("main done!")
class FirehoseWebSocket(tornado.websocket.WebSocketHandler):

    @tornado.gen.coroutine
    def open(self):
        print("hose open")
        global queues
        self.queue = Queue()
        queues.append(self.queue)
        while True:
            item = yield self.queue.get()
            self.queue.task_done()
            self.write_message(json.dumps(item))

    @tornado.gen.coroutine
    def on_close(self):
        global queues
        yield self.queue.join()
        queues.remove(self.queue)
class TornadoPikaPublisher(BeergardenPublisher, PikaClient):

    def __init__(self, **kwargs):
        self.logger = logging.getLogger(__name__)

        self._shutdown_timeout = timedelta(seconds=kwargs.pop("shutdown_timeout", 5))
        self._work_queue = Queue()
        self._connection = None
        self._channel = None

        self.coroutiner = CoroutineMaker({
            "TornadoConnection": "on_open_callback",
            "channel": "on_open_callback",
        })

        # Trying to get super() to work with incompatible signatures is a nightmare
        BeergardenPublisher.__init__(self)
        PikaClient.__init__(self, **kwargs)

        IOLoop.current().spawn_callback(self._process)

    def shutdown(self):
        return self._work_queue.join(timeout=self._shutdown_timeout)

    @coroutine
    def _open_connection(self):
        self._connection = yield self.coroutiner.convert(TornadoConnection)(
            parameters=self._conn_params)

    @coroutine
    def _open_channel(self):
        self._channel = yield self.coroutiner.convert(self._connection.channel)()

    @coroutine
    def _process(self):
        while True:
            item = yield self._work_queue.get()
            try:
                if not self._connection or not self._connection.is_open:
                    yield self._open_connection()
                if not self._channel or not self._channel.is_open:
                    yield self._open_channel()
                yield getattr(self._channel, item[0])(**item[1])
            finally:
                self._work_queue.task_done()

    def publish(self, message, **kwargs):
        """Publish a message.

        :param message: The message to publish
        :param kwargs: Additional message properties

        :Keyword Arguments:
            * *routing_key* -- Routing key to use when publishing
            * *headers* -- Headers to be included as part of the message properties
            * *expiration* -- Expiration to be included as part of the message properties

        :return: None
        """
        self._work_queue.put((
            "basic_publish",
            {
                "exchange": self._exchange,
                "routing_key": kwargs["routing_key"],
                "body": message,
                "properties": BasicProperties(
                    app_id="beer-garden",
                    content_type="text/plain",
                    headers=kwargs.pop("headers", None),
                    expiration=kwargs.pop("expiration", None),
                ),
            },
        ))

    def _event_publish_args(self, event, **kwargs):
        # Main thing we need to do here is figure out the appropriate routing key
        args = {}

        if event.metadata and "routing_key" in event.metadata:
            args["routing_key"] = event.metadata["routing_key"]
        elif "request" in kwargs:
            request = kwargs["request"]
            args["routing_key"] = get_routing_key(
                "request", request.system, request.system_version, request.instance_name)
        else:
            args["routing_key"] = "beergarden"

        return args
def get_file_list(account, **kwargs):
    queue = Queue()
    sem = BoundedSemaphore(FETCH_CONCURRENCY)
    done, working = set(), set()
    data = set()

    request_params = {
        'type': 'file',
        'limit': 200,
        'size_range': ',{}'.format(RESPONSE_SIZE_LIMIT * 1000000),
        'file_extensions': ','.join(VALID_FILETYPES)
    }
    qs = '&'.join(["{}={}".format(key, value)
                   for key, value in request_params.items()])

    @gen.coroutine
    def fetch_url():
        current_url = yield queue.get()
        try:
            if current_url in working:
                return
            page_no = len(working)
            app_log.info("Fetching page {}".format(page_no))
            working.add(current_url)
            req = account.get_request(current_url)
            client = AsyncHTTPClient()
            response = yield client.fetch(req)
            done.add(current_url)
            app_log.info("Page {} downloaded".format(page_no))
            response_data = json.loads(response.body.decode('utf-8'))
            for file in response_data.get('entries', []):
                file_entry = (
                    ('/'.join([path['name']
                               for path in file['path_collection']['entries']
                               if path['id'] != '0'] + [file['name']])).lstrip('/'),
                    file['id'])
                # be sure we're a valid file type and less than our maximum response size limit
                extension = file['name'].lower().split('.')[-1]
                if extension in VALID_FILETYPES:
                    data.add(file_entry)
            app_log.info("Page {} completed".format(page_no))
        finally:
            queue.task_done()
            sem.release()

    @gen.coroutine
    def worker():
        while True:
            yield sem.acquire()
            fetch_url()

    app_log.info("Gathering filelist for account {}".format(account._id))
    for file_type in VALID_FILETYPES:
        url = "https://api.box.com/2.0/search?query={}&{}".format(file_type, qs)
        queue.put(url)

    # start our concurrency worker
    worker()

    # wait until we're done
    yield queue.join(timeout=timedelta(seconds=MAXIMUM_REQ_TIME))
    app_log.info("Finished list retrieval. Found {} items.".format(len(data)))
    return sorted([{"title": title, "value": path} for title, path in data],
                  key=lambda f: f['title'])
class AsynSpider(MySpider):

    def __init__(self, out, **kwargs):
        super(AsynSpider, self).__init__(out, **kwargs)
        self.client = httpclient.AsyncHTTPClient()
        self.q = Queue()
        self.fetching, self.fetched = set(), set()

    def assign_jobs(self, jobs):
        for job in jobs:
            self.q.put(job)

    @gen.coroutine
    def run(self):
        if self.q.empty():
            url = LIST_URL + urllib.urlencode(self.list_query)
            self.q.put(url)
        for _ in range(CONCURRENCY):
            self.worker()
        yield self.q.join()
        assert self.fetching == self.fetched
        # print len(self.fetched)
        if isinstance(self._out, Analysis):
            self._out.finish()

    @gen.coroutine
    def worker(self):
        while True:
            yield self.fetch_url()

    @gen.coroutine
    def fetch_url(self):
        current_url = yield self.q.get()
        try:
            if current_url in self.fetching:
                return
            self.fetching.add(current_url)
            request = httpclient.HTTPRequest(current_url, headers=HEADERS)
            resp = yield self.client.fetch(request)
            self.fetched.add(current_url)
            xml = etree.fromstring(resp.body)
            has_total_count = xml.xpath("//totalcount/text()")
            if has_total_count:  # non-empty means a list page, otherwise a detail page
                total_count = int(has_total_count[0])
                if total_count == 0:
                    return  # list page out of range
                if self.list_query["pageno"] == 1:
                    pageno = 2
                    while pageno < 10:
                        # while pageno <= total_count / PAGE_SIZE:
                        self.list_query["pageno"] = pageno
                        next_list_url = LIST_URL + urllib.urlencode(self.list_query)
                        self.q.put(next_list_url)
                        # logging.info(next_list_url)
                        pageno += 1
                job_ids = xml.xpath("//jobid/text()")
                job_detail_urls = []
                for ID in job_ids:
                    new_detail_query = DETAIL_QUERY.copy()
                    new_detail_query["jobid"] = ID
                    job_detail_urls.append(DETAIL_URL + urllib.urlencode(new_detail_query))
                for detail_url in job_detail_urls:
                    self.q.put(detail_url)
                    # logging.info(detail_url)
            else:
                self._out.collect(xml)
        finally:
            self.q.task_done()
def get_file_list(account, **kwargs):
    queue = Queue()
    sem = BoundedSemaphore(FETCH_CONCURRENCY)
    done, working = set(), set()
    data = []
    ids = set()

    @gen.coroutine
    def fetch_url():
        current_url = yield queue.get()
        try:
            if current_url in working:
                return
            page_no = len(working)
            app_log.info("Fetching page {}".format(page_no))
            working.add(current_url)
            req = account.get_request(current_url)
            client = AsyncHTTPClient()
            response = yield client.fetch(req)
            done.add(current_url)
            app_log.info("Page {} downloaded".format(page_no))
            response_data = json.loads(response.body.decode("utf-8"))
            url = response_data.get("@odata.nextLink", None)
            if url is not None:
                queue.put(url)
            for file in response_data.get("value", []):
                if file["name"][-4:].strip(".").lower() in VALID_FILETYPES:
                    if file["id"] not in ids:
                        ids.add(file["id"])
                        data.append({
                            "title": file["parentReference"]["path"].split(":")[1].lstrip("/")
                                     + "/" + file["name"],
                            "value": file["id"],
                        })
            app_log.info("Page {} completed".format(page_no))
        finally:
            queue.task_done()
            sem.release()

    @gen.coroutine
    def worker():
        while True:
            yield sem.acquire()
            fetch_url()

    app_log.info("Gathering filelist for account {}".format(account._id))
    for file_type in VALID_FILETYPES:
        url = ("https://api.onedrive.com/v1.0/drive/root/view.search"
               "?top=1000&select=parentReference,name,id,size&q={}").format(file_type)
        queue.put(url)

    # start our concurrency worker
    worker()

    # wait until we're done
    yield queue.join(timeout=timedelta(seconds=MAXIMUM_REQ_TIME))
    app_log.info("Finished list retrieval. Found {} items.".format(len(data)))
    return sorted(data, key=lambda f: f["title"])
def get_data(cls, account, source_filter, limit=100, skip=0):
    """
    Gathers commit information from GH
    GET https://api.github.com/repos/:owner/:repo/commits
    Header: Accept: application/vnd.github.v3+json
    """
    if not account or not account.enabled:
        raise ValueError('cannot gather information without a valid account')
    client = AsyncHTTPClient()

    source_filter = GitHubRepositoryDateFilter(source_filter)

    if source_filter.repository is None:
        raise ValueError('required parameter projects missing')

    default_headers = {"Content-Type": "application/json",
                       "Accept": "application/vnd.github.v3+json"}

    # first we grab our list of commits
    uri = "https://api.github.com/repos/{}/commits".format(source_filter.repository)
    qs = source_filter.get_qs()
    if qs != '':
        uri = uri + '?' + qs
    app_log.info("Starting retrieval of commit list for account {}".format(account._id))

    # avoid a second '?' when the URI already carries a query string
    separator = '&' if qs != '' else '?'
    if limit is not None and limit <= 100:
        # we can handle our limit right here
        uri += "{}per_page={}".format(separator, limit)
    elif limit is None:
        uri += "{}per_page=100".format(separator)  # maximum number per page for GitHub API

    taken = 0
    queue = Queue()
    sem = BoundedSemaphore(FETCH_CONCURRENCY)
    done, working = set(), set()

    while uri is not None:
        app_log.info("({}) Retrieving next page, received {} commits thus far".format(account._id, taken))
        req = account.get_request(uri, headers=default_headers)
        response = yield client.fetch(req)

        page_data = json.loads(response.body.decode('utf-8'))
        taken += len(page_data)
        for item in page_data:
            queue.put(item.get('url', None))

        if limit is None or taken < limit:
            # parse the Link header from GitHub (https://developer.github.com/v3/#pagination)
            links = parse_link_header(response.headers.get('Link', ''))
            uri = links.get('next', None)
        else:
            break

        if queue.qsize() > 500:
            raise HTTPError(413, 'too many commits')

    app_log.info("({}) Commit list retrieved, fetching info for {} commits".format(account._id, taken))

    # open our list
    cls.write('[')

    # our worker to actually fetch the info
    @gen.coroutine
    def fetch_url():
        current_url = yield queue.get()
        try:
            if current_url in working:
                return
            page_no = len(working)
            app_log.info("Fetching page {}".format(page_no))
            working.add(current_url)
            req = account.get_request(current_url)
            client = AsyncHTTPClient()
            response = yield client.fetch(req)
            response_data = json.loads(response.body.decode('utf-8'))
            obj = {
                'date': response_data['commit']['author']['date'],
                'author': response_data['commit']['author']['name'],
                'added_files': len([file for file in response_data['files']
                                    if file['status'] == 'added']),
                'deleted_files': len([file for file in response_data['files']
                                      if file['status'] == 'deleted']),
                'modified_files': len([file for file in response_data['files']
                                       if file['status'] == 'modified']),
                'additions': response_data['stats']['additions'],
                'deletions': response_data['stats']['deletions'],
            }
            if len(done) > 0:
                cls.write(',')
            cls.write(json.dumps(obj))
            done.add(current_url)
            app_log.info("Page {} downloaded".format(page_no))
        finally:
            queue.task_done()
            sem.release()

    @gen.coroutine
    def worker():
        while True:
            yield sem.acquire()
            fetch_url()

    # start our concurrency worker
    worker()

    try:
        # wait until we're done
        yield queue.join(timeout=timedelta(seconds=MAXIMUM_REQ_TIME))
    except gen.TimeoutError:
        app_log.warning("Request exceeds maximum time, cutting response short")
    finally:
        # close our list
        cls.write(']')
    app_log.info("Finished retrieving commits for {}".format(account._id))
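# parse_link_header is used above but not shown. A possible implementation
# based on GitHub's documented Link header format (this is a sketch, not the
# original helper):
import re

def parse_link_header(value):
    """Parse '<url>; rel="next", <url>; rel="last"' into {rel: url}."""
    links = {}
    for part in value.split(','):
        match = re.search(r'<([^>]+)>;\s*rel="([^"]+)"', part)
        if match:
            links[match.group(2)] = match.group(1)
    return links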
class ProjectGroomer(object):
    """ Cleans up expired transactions for a project. """

    def __init__(self, project_id, coordinator, zk_client, db_access, thread_pool):
        """ Creates a new ProjectGroomer.

        Args:
            project_id: A string specifying a project ID.
            coordinator: A GroomingCoordinator.
            zk_client: A KazooClient.
            db_access: A DatastoreProxy.
            thread_pool: A ThreadPoolExecutor.
        """
        self.project_id = project_id

        self._coordinator = coordinator
        self._zk_client = zk_client
        self._tornado_zk = TornadoKazoo(self._zk_client)
        self._db_access = db_access
        self._thread_pool = thread_pool
        self._project_node = '/appscale/apps/{}'.format(self.project_id)
        self._containers = []
        self._inactive_containers = set()
        self._batch_resolver = BatchResolver(self.project_id, self._db_access)

        self._zk_client.ensure_path(self._project_node)
        self._zk_client.ChildrenWatch(self._project_node, self._update_containers)

        self._txid_manual_offset = 0
        self._offset_node = '/'.join([self._project_node, OFFSET_NODE])
        self._zk_client.DataWatch(self._offset_node, self._update_offset)

        self._stop_event = AsyncEvent()
        self._stopped_event = AsyncEvent()

        # Keeps track of cleanup results for each round of grooming.
        self._txids_cleaned = 0
        self._oldest_valid_tx_time = None

        self._worker_queue = AsyncQueue(maxsize=MAX_CONCURRENCY)
        for _ in range(MAX_CONCURRENCY):
            IOLoop.current().spawn_callback(self._worker)

        IOLoop.current().spawn_callback(self.start)

    @gen.coroutine
    def start(self):
        """ Starts the grooming process until the stop event is set. """
        logger.info('Grooming {}'.format(self.project_id))
        while True:
            if self._stop_event.is_set():
                break

            try:
                yield self._groom_project()
            except Exception:
                # Prevent the grooming loop from stopping if an error is encountered.
                logger.exception('Unexpected error while grooming {}'.format(self.project_id))
                yield gen.sleep(MAX_TX_DURATION)

        self._stopped_event.set()

    @gen.coroutine
    def stop(self):
        """ Stops the grooming process. """
        logger.info('Stopping grooming process for {}'.format(self.project_id))
        self._stop_event.set()
        yield self._stopped_event.wait()

    @gen.coroutine
    def _worker(self):
        """ Processes items in the worker queue. """
        while True:
            tx_path, composite_indexes = yield self._worker_queue.get()
            try:
                tx_time = yield self._resolve_txid(tx_path, composite_indexes)
                if tx_time is None:
                    self._txids_cleaned += 1
                if tx_time is not None and tx_time < self._oldest_valid_tx_time:
                    self._oldest_valid_tx_time = tx_time
            except Exception:
                logger.exception('Unexpected error while resolving {}'.format(tx_path))
            finally:
                self._worker_queue.task_done()

    def _update_offset(self, new_offset, _):
        """ Watches for updates to the manual offset node.

        Args:
            new_offset: A string specifying the new manual offset.
        """
        self._txid_manual_offset = int(new_offset or 0)

    def _update_containers(self, nodes):
        """ Updates the list of active txid containers.

        Args:
            nodes: A list of strings specifying ZooKeeper nodes.
        """
        counters = [int(node[len(CONTAINER_PREFIX):] or 1)
                    for node in nodes
                    if node.startswith(CONTAINER_PREFIX)
                    and node not in self._inactive_containers]
        counters.sort()

        containers = [CONTAINER_PREFIX + str(counter) for counter in counters]
        if containers and containers[0] == '{}1'.format(CONTAINER_PREFIX):
            containers[0] = CONTAINER_PREFIX

        self._containers = containers

    @gen.coroutine
    def _groom_project(self):
        """ Runs the grooming process. """
        index = self._coordinator.index
        worker_count = self._coordinator.total_workers

        oldest_valid_tx_time = yield self._fetch_and_clean(index, worker_count)

        # Wait until there's a reasonable chance that some transactions have
        # timed out.
        next_timeout_eta = oldest_valid_tx_time + MAX_TX_DURATION

        # The oldest ignored transaction should still be valid, but ensure that
        # the timeout is not negative.
        next_timeout = max(0, next_timeout_eta - time.time())
        time_to_wait = datetime.timedelta(seconds=next_timeout + (MAX_TX_DURATION / 2))

        # Allow the wait to be cut short when a project is removed.
        try:
            yield self._stop_event.wait(timeout=time_to_wait)
        except gen.TimeoutError:
            return

    @gen.coroutine
    def _remove_locks(self, txid, tx_path):
        """ Removes entity locks involved with the transaction.

        Args:
            txid: An integer specifying the transaction ID.
            tx_path: A string specifying the location of the transaction node.
        """
        groups_path = '/'.join([tx_path, 'groups'])
        try:
            groups_data = yield self._tornado_zk.get(groups_path)
        except NoNodeError:
            # If the group list does not exist, the locks have not been acquired.
            return

        group_paths = json.loads(groups_data[0])
        for group_path in group_paths:
            try:
                contenders = yield self._tornado_zk.get_children(group_path)
            except NoNodeError:
                # The lock may have been cleaned up or not acquired in the first place.
                continue

            for contender in contenders:
                contender_path = '/'.join([group_path, contender])
                contender_data = yield self._tornado_zk.get(contender_path)
                contender_txid = int(contender_data[0])
                if contender_txid != txid:
                    continue

                yield self._tornado_zk.delete(contender_path)
                break

    @gen.coroutine
    def _remove_path(self, tx_path):
        """ Removes a ZooKeeper node.

        Args:
            tx_path: A string specifying the path to delete.
        """
        try:
            yield self._tornado_zk.delete(tx_path)
        except NoNodeError:
            pass
        except NotEmptyError:
            yield self._thread_pool.submit(self._zk_client.delete, tx_path,
                                           recursive=True)

    @gen.coroutine
    def _resolve_txid(self, tx_path, composite_indexes):
        """ Cleans up a transaction if it has expired.

        Args:
            tx_path: A string specifying the location of the ZooKeeper node.
            composite_indexes: A list of CompositeIndex objects.
        Returns:
            The transaction start time if still valid, None if invalid because
            this method will also delete it.
        """
        try:
            tx_data = yield self._tornado_zk.get(tx_path)
        except NoNodeError:
            return

        tx_time = float(tx_data[0])

        _, container, tx_node = tx_path.rsplit('/', 2)
        tx_node_id = int(tx_node.lstrip(COUNTER_NODE_PREFIX))
        container_count = int(container[len(CONTAINER_PREFIX):] or 1)
        if tx_node_id < 0:
            yield self._remove_path(tx_path)
            return

        container_size = MAX_SEQUENCE_COUNTER + 1
        automatic_offset = (container_count - 1) * container_size
        txid = self._txid_manual_offset + automatic_offset + tx_node_id

        if txid < 1:
            yield self._remove_path(tx_path)
            return

        # If the transaction is still valid, return the time it was created.
        if tx_time + MAX_TX_DURATION >= time.time():
            raise gen.Return(tx_time)

        yield self._batch_resolver.resolve(txid, composite_indexes)
        yield self._remove_locks(txid, tx_path)
        yield self._remove_path(tx_path)
        yield self._batch_resolver.cleanup(txid)

    @gen.coroutine
    def _fetch_and_clean(self, worker_index, worker_count):
        """ Cleans up expired transactions.

        Args:
            worker_index: An integer specifying this worker's index.
            worker_count: An integer specifying the number of total workers.
        Returns:
            A float specifying the time of the oldest valid transaction as a
            unix timestamp.
        """
        self._txids_cleaned = 0
        self._oldest_valid_tx_time = time.time()

        children = []
        for index, container in enumerate(self._containers):
            container_path = '/'.join([self._project_node, container])
            new_children = yield self._tornado_zk.get_children(container_path)

            if not new_children and index < len(self._containers) - 1:
                self._inactive_containers.add(container)

            children.extend(['/'.join([container_path, node]) for node in new_children])

        logger.debug('Found {} transaction IDs for {}'.format(
            len(children), self.project_id))

        if not children:
            raise gen.Return(self._oldest_valid_tx_time)

        # Refresh these each time so that the indexes are fresh.
        encoded_indexes = yield self._thread_pool.submit(
            self._db_access.get_indices, self.project_id)
        composite_indexes = [CompositeIndex(index) for index in encoded_indexes]

        for tx_path in children:
            tx_node_id = int(tx_path.split('/')[-1].lstrip(COUNTER_NODE_PREFIX))

            # Only resolve transactions that this worker has been assigned.
            if tx_node_id % worker_count != worker_index:
                continue

            yield self._worker_queue.put((tx_path, composite_indexes))

        yield self._worker_queue.join()

        if self._txids_cleaned > 0:
            logger.info('Cleaned up {} expired txids for {}'.format(
                self._txids_cleaned, self.project_id))

        raise gen.Return(self._oldest_valid_tx_time)
class Scraper():

    def __init__(self,
                 request_params=[{}],
                 max_clients=100,
                 maxsize=100,
                 connect_timeout=9999999,
                 request_timeout=9999999,
                 auth_username=None,
                 auth_password=None,
                 method='GET',
                 func=None,
                 sleep=0,
                 endpoint=None):
        """Instantiate a tornado async http client to do multiple concurrent requests"""

        self.sleep = sleep
        self.endpoint = endpoint
        self.max_clients = max_clients
        AsyncHTTPClient.configure("tornado.simple_httpclient.SimpleAsyncHTTPClient",
                                  max_clients=self.max_clients)
        self.request_params = request_params
        self.method = method
        self.maxsize = maxsize
        self.auth_username = auth_username
        self.auth_password = auth_password
        self.connect_timeout = connect_timeout
        self.request_timeout = request_timeout
        self.to_return = []
        self.http_client = AsyncHTTPClient()
        self.queue = Queue(maxsize=self.maxsize)
        self.func = func

        self.read(self.request_params)
        self.get(self.connect_timeout, self.request_timeout, self.http_client)
        self.loop = ioloop.IOLoop.current()
        self.join_future = self.queue.join()

        def done(future):
            self.loop.stop()

        self.join_future.add_done_callback(done)
        self.loop.start()

    @gen.coroutine
    def read(self, request_params):
        for request_param in request_params:
            yield self.queue.put(request_param)

    @gen.coroutine
    def get(self, connect_timeout, request_timeout, http_client):
        print("Getting Links")
        self.counter = 1
        while True:
            request_param = yield self.queue.get()
            url = request_param.get('url', self.endpoint)
            body = request_param.get('body', None)
            dictKey = request_param['dictKey']
            # request_param['headers']['dictKey'] = dictKey
            request = CustomHTTPRequest(url,
                                        method=self.method,
                                        headers=request_param['headers'],
                                        body=body,
                                        connect_timeout=connect_timeout,
                                        request_timeout=request_timeout,
                                        auth_username=self.auth_username,
                                        auth_password=self.auth_password,
                                        key=dictKey)

            def handle_response(response):
                if not self.func:
                    if response.error:
                        self.to_return.append({
                            'key': response.request.__dict__['key'],
                            'response': str(response.error)
                        })
                    else:
                        self.to_return.append({
                            'key': response.request.__dict__['key'],
                            'response': response.body
                        })
                else:
                    try:
                        self.func(response.body, response.request.__dict__['key'])
                    except Exception:
                        pass
                # print(self.counter)
                self.counter += 1
                self.queue.task_done()

            future = self.http_client.fetch(request, handle_response)
            # throttle politely; a blocking time.sleep here would stall the IOLoop
            yield gen.sleep(self.sleep)

    def return_results(self):
        return self.to_return
class SQSDrain(object):
    """Implementation of IDrain that writes to an AWS SQS queue."""

    def __init__(self, logger, loop, sqs_client, metric_prefix='emitter'):
        self.emitter = sqs_client
        self.logger = logger
        self.loop = loop
        self.metric_prefix = metric_prefix
        self.output_error = Event()
        self.state = RUNNING
        self.sender_tag = 'sender:%s.%s' % (self.__class__.__module__,
                                            self.__class__.__name__)
        self._send_queue = Queue()
        self._should_flush_queue = Event()
        self._flush_handle = None
        self.loop.spawn_callback(self._onSend)

    @gen.coroutine
    def _flush_send_batch(self, batch_size):
        send_batch = [
            self._send_queue.get_nowait()
            for pos in range(min(batch_size, self.emitter.max_messages))
        ]
        try:
            response = yield self.emitter.send_message_batch(*send_batch)
        except SQSError as err:
            self.logger.exception('Error encountered flushing data to SQS: %s', err)
            self.output_error.set()
            for msg in send_batch:
                self._send_queue.put_nowait(msg)
        else:
            if response.Failed:
                self.output_error.set()
                for req in response.Failed:
                    self.logger.error('Message failed to send: %s', req.Id)
                    self._send_queue.put_nowait(req)

    @gen.coroutine
    def _onSend(self):
        respawn = True
        while respawn:
            qsize = self._send_queue.qsize()
            # This will keep flushing until clear,
            # including items that show up in between flushes
            while qsize > 0:
                try:
                    yield self._flush_send_batch(qsize)
                except Exception as err:
                    self.logger.exception(err)
                    self.output_error.set()
                qsize = self._send_queue.qsize()
            # We've cleared the backlog, remove any possible future flush
            if self._flush_handle:
                self.loop.remove_timeout(self._flush_handle)
                self._flush_handle = None
            self._should_flush_queue.clear()
            yield self._should_flush_queue.wait()

    @gen.coroutine
    def close(self, timeout=None):
        self.state = CLOSING
        yield self._send_queue.join(timeout)

    def emit_nowait(self, msg):
        if self._send_queue.qsize() >= self.emitter.max_messages:
            # Signal flush
            self._should_flush_queue.set()
            raise QueueFull()
        elif self._flush_handle is None:
            # Ensure we flush messages at least by MAX_TIMEOUT
            self._flush_handle = self.loop.add_timeout(
                MAX_TIMEOUT,
                lambda: self._should_flush_queue.set(),
            )
        self.logger.debug("Drain emitting")
        self._send_queue.put_nowait(msg)

    @gen.coroutine
    def emit(self, msg, timeout=None):
        if self._send_queue.qsize() >= self.emitter.max_messages:
            # Signal flush
            self._should_flush_queue.set()
        elif self._flush_handle is None:
            # Ensure we flush messages at least by MAX_TIMEOUT
            self._flush_handle = self.loop.add_timeout(
                MAX_TIMEOUT,
                lambda: self._should_flush_queue.set(),
            )
        yield self._send_queue.put(msg, timeout)
class ProjectGroomer(object):
    """ Cleans up expired transactions for a project. """

    def __init__(self, project_id, coordinator, zk_client, db_access, thread_pool):
        """ Creates a new ProjectGroomer.

        Args:
            project_id: A string specifying a project ID.
            coordinator: A GroomingCoordinator.
            zk_client: A KazooClient.
            db_access: A DatastoreProxy.
            thread_pool: A ThreadPoolExecutor.
        """
        self.project_id = project_id

        self._coordinator = coordinator
        self._zk_client = zk_client
        self._tornado_zk = TornadoKazoo(self._zk_client)
        self._db_access = db_access
        self._thread_pool = thread_pool
        self._project_node = '/appscale/apps/{}'.format(self.project_id)
        self._containers = []
        self._inactive_containers = set()
        self._batch_resolver = BatchResolver(self.project_id, self._db_access)

        self._zk_client.ensure_path(self._project_node)
        self._zk_client.ChildrenWatch(self._project_node, self._update_containers)

        self._txid_manual_offset = 0
        self._offset_node = '/'.join([self._project_node, OFFSET_NODE])
        self._zk_client.DataWatch(self._offset_node, self._update_offset)

        self._stop_event = AsyncEvent()
        self._stopped_event = AsyncEvent()

        # Keeps track of cleanup results for each round of grooming.
        self._txids_cleaned = 0
        self._oldest_valid_tx_time = None

        self._worker_queue = AsyncQueue(maxsize=MAX_CONCURRENCY)
        for _ in range(MAX_CONCURRENCY):
            IOLoop.current().spawn_callback(self._worker)

        IOLoop.current().spawn_callback(self.start)

    @gen.coroutine
    def start(self):
        """ Starts the grooming process until the stop event is set. """
        logger.info('Grooming {}'.format(self.project_id))
        while True:
            if self._stop_event.is_set():
                break

            try:
                yield self._groom_project()
            except Exception:
                # Prevent the grooming loop from stopping if an error is encountered.
                logger.exception('Unexpected error while grooming {}'.format(self.project_id))
                yield gen.sleep(MAX_TX_DURATION)

        self._stopped_event.set()

    @gen.coroutine
    def stop(self):
        """ Stops the grooming process. """
        logger.info('Stopping grooming process for {}'.format(self.project_id))
        self._stop_event.set()
        yield self._stopped_event.wait()

    @gen.coroutine
    def _worker(self):
        """ Processes items in the worker queue. """
        while True:
            tx_path, composite_indexes = yield self._worker_queue.get()
            try:
                tx_time = yield self._resolve_txid(tx_path, composite_indexes)
                if tx_time is None:
                    self._txids_cleaned += 1
                if tx_time is not None and tx_time < self._oldest_valid_tx_time:
                    self._oldest_valid_tx_time = tx_time
            finally:
                self._worker_queue.task_done()

    def _update_offset(self, new_offset, _):
        """ Watches for updates to the manual offset node.

        Args:
            new_offset: A string specifying the new manual offset.
        """
        self._txid_manual_offset = int(new_offset or 0)

    def _update_containers(self, nodes):
        """ Updates the list of active txid containers.

        Args:
            nodes: A list of strings specifying ZooKeeper nodes.
        """
        counters = [int(node[len(CONTAINER_PREFIX):] or 1)
                    for node in nodes
                    if node.startswith(CONTAINER_PREFIX)
                    and node not in self._inactive_containers]
        counters.sort()

        containers = [CONTAINER_PREFIX + str(counter) for counter in counters]
        if containers and containers[0] == '{}1'.format(CONTAINER_PREFIX):
            containers[0] = CONTAINER_PREFIX

        self._containers = containers

    @gen.coroutine
    def _groom_project(self):
        """ Runs the grooming process. """
        index = self._coordinator.index
        worker_count = self._coordinator.total_workers

        oldest_valid_tx_time = yield self._fetch_and_clean(index, worker_count)

        # Wait until there's a reasonable chance that some transactions have
        # timed out.
        next_timeout_eta = oldest_valid_tx_time + MAX_TX_DURATION

        # The oldest ignored transaction should still be valid, but ensure that
        # the timeout is not negative.
        next_timeout = max(0, next_timeout_eta - time.time())
        time_to_wait = datetime.timedelta(seconds=next_timeout + (MAX_TX_DURATION / 2))

        # Allow the wait to be cut short when a project is removed.
        try:
            yield self._stop_event.wait(timeout=time_to_wait)
        except gen.TimeoutError:
            raise gen.Return()

    @gen.coroutine
    def _remove_path(self, tx_path):
        """ Removes a ZooKeeper node.

        Args:
            tx_path: A string specifying the path to delete.
        """
        try:
            yield self._tornado_zk.delete(tx_path)
        except NoNodeError:
            pass
        except NotEmptyError:
            yield self._thread_pool.submit(self._zk_client.delete, tx_path,
                                           recursive=True)

    @gen.coroutine
    def _resolve_txid(self, tx_path, composite_indexes):
        """ Cleans up a transaction if it has expired.

        Args:
            tx_path: A string specifying the location of the ZooKeeper node.
            composite_indexes: A list of CompositeIndex objects.
        Returns:
            The transaction start time if still valid, None if invalid because
            this method will also delete it.
        """
        tx_data = yield self._tornado_zk.get(tx_path)
        tx_time = float(tx_data[0])

        _, container, tx_node = tx_path.rsplit('/', 2)
        tx_node_id = int(tx_node.lstrip(COUNTER_NODE_PREFIX))
        container_count = int(container[len(CONTAINER_PREFIX):] or 1)
        if tx_node_id < 0:
            yield self._remove_path(tx_path)
            raise gen.Return()

        container_size = MAX_SEQUENCE_COUNTER + 1
        automatic_offset = (container_count - 1) * container_size
        txid = self._txid_manual_offset + automatic_offset + tx_node_id

        if txid < 1:
            yield self._remove_path(tx_path)
            raise gen.Return()

        # If the transaction is still valid, return the time it was created.
        if tx_time + MAX_TX_DURATION >= time.time():
            raise gen.Return(tx_time)

        yield self._batch_resolver.resolve(txid, composite_indexes)
        yield self._remove_path(tx_path)
        yield self._batch_resolver.cleanup(txid)

    @gen.coroutine
    def _fetch_and_clean(self, worker_index, worker_count):
        """ Cleans up expired transactions.

        Args:
            worker_index: An integer specifying this worker's index.
            worker_count: An integer specifying the number of total workers.
        Returns:
            A float specifying the time of the oldest valid transaction as a
            unix timestamp.
        """
        self._txids_cleaned = 0
        self._oldest_valid_tx_time = time.time()

        children = []
        for index, container in enumerate(self._containers):
            container_path = '/'.join([self._project_node, container])
            new_children = yield self._tornado_zk.get_children(container_path)

            if not new_children and index < len(self._containers) - 1:
                self._inactive_containers.add(container)

            children.extend(['/'.join([container_path, node]) for node in new_children])

        logger.debug('Found {} transaction IDs for {}'.format(
            len(children), self.project_id))

        if not children:
            raise gen.Return(self._oldest_valid_tx_time)

        # Refresh these each time so that the indexes are fresh.
        encoded_indexes = yield self._thread_pool.submit(
            self._db_access.get_indices, self.project_id)
        composite_indexes = [CompositeIndex(index) for index in encoded_indexes]

        for tx_path in children:
            tx_node_id = int(tx_path.split('/')[-1].lstrip(COUNTER_NODE_PREFIX))

            # Only resolve transactions that this worker has been assigned.
            if tx_node_id % worker_count != worker_index:
                continue

            yield self._worker_queue.put((tx_path, composite_indexes))

        yield self._worker_queue.join()

        if self._txids_cleaned > 0:
            logger.info('Cleaned up {} expired txids for {}'.format(
                self._txids_cleaned, self.project_id))

        raise gen.Return(self._oldest_valid_tx_time)
class AsyncTaskManager(object):
    """
    Aucote uses asynchronous tasks executed in the ioloop. Some of them,
    especially scanners, should finish before the ioloop stops.

    This class should be accessed by the `instance` class method, which
    returns a global instance of the task manager.
    """
    _instances = {}

    TASKS_POLITIC_WAIT = 0
    TASKS_POLITIC_KILL_WORKING_FIRST = 1
    TASKS_POLITIC_KILL_PROPORTIONS = 2
    TASKS_POLITIC_KILL_WORKING = 3

    def __init__(self, parallel_tasks=10):
        self._shutdown_condition = Event()
        self._stop_condition = Event()
        self._cron_tasks = {}
        self._parallel_tasks = parallel_tasks
        self._tasks = Queue()
        self._task_workers = {}
        self._events = {}
        self._limit = self._parallel_tasks
        self._next_task_number = 0
        self._toucan_keys = {}

    @classmethod
    def instance(cls, name=None, **kwargs):
        """
        Return instance of AsyncTaskManager

        Returns:
            AsyncTaskManager
        """
        if cls._instances.get(name) is None:
            cls._instances[name] = AsyncTaskManager(**kwargs)
        return cls._instances[name]

    @property
    def shutdown_condition(self):
        """
        Event which is resolved if every job is done and AsyncTaskManager is ready to shutdown

        Returns:
            Event
        """
        return self._shutdown_condition

    def start(self):
        """
        Start CronTabCallback tasks

        Returns:
            None
        """
        for task in self._cron_tasks.values():
            task.start()

        for number in range(self._parallel_tasks):
            self._task_workers[number] = IOLoop.current().add_callback(
                partial(self.process_tasks, number))

        self._next_task_number = self._parallel_tasks

    def add_crontab_task(self, task, cron, event=None):
        """
        Add function to scheduler and execute at cron time

        Args:
            task (function):
            cron (str): crontab value
            event (Event): event which prevents running tasks with a similar aim,
                e.g. security scans

        Returns:
            None
        """
        if event is not None:
            event = self._events.setdefault(event, Event())
        self._cron_tasks[task] = AsyncCrontabTask(cron, task, event)

    @gen.coroutine
    def stop(self):
        """
        Stop CronTabCallback tasks and wait on them to finish

        Returns:
            None
        """
        for task in self._cron_tasks.values():
            task.stop()
        IOLoop.current().add_callback(self._prepare_shutdown)
        yield [self._stop_condition.wait(), self._tasks.join()]
        self._shutdown_condition.set()

    def _prepare_shutdown(self):
        """
        Check if ioloop can be stopped

        Returns:
            None
        """
        if any(task.is_running() for task in self._cron_tasks.values()):
            IOLoop.current().add_callback(self._prepare_shutdown)
            return
        self._stop_condition.set()

    def clear(self):
        """
        Clear list of tasks

        Returns:
            None
        """
        self._cron_tasks = {}
        self._shutdown_condition.clear()
        self._stop_condition.clear()

    async def process_tasks(self, number):
        """
        Execute the queue. Every task is executed in a separate thread (_Executor).
        """
        log.info("Starting worker %s", number)
        while True:
            try:
                item = self._tasks.get_nowait()
                try:
                    log.debug("Worker %s: starting %s", number, item)
                    thread = _Executor(task=item, number=number)
                    self._task_workers[number] = thread
                    thread.start()
                    while thread.is_alive():
                        await sleep(0.5)
                except:
                    log.exception("Worker %s: exception occurred", number)
                finally:
                    log.debug("Worker %s: %s finished", number, item)
                    self._tasks.task_done()
                    tasks_per_scan = ('{}: {}'.format(scanner, len(tasks))
                                      for scanner, tasks in self.tasks_by_scan.items())
                    log.debug("Tasks left in queue: %s (%s)",
                              self.unfinished_tasks, ', '.join(tasks_per_scan))
                    self._task_workers[number] = None
            except QueueEmpty:
                await gen.sleep(0.5)
                if self._stop_condition.is_set() and self._tasks.empty():
                    return
            finally:
                if self._limit < len(self._task_workers):
                    break

        del self._task_workers[number]
        log.info("Closing worker %s", number)

    def add_task(self, task):
        """
        Add task to the queue

        Args:
            task:

        Returns:
            None
        """
        self._tasks.put(task)

    @property
    def unfinished_tasks(self):
        """
        Tasks which are still being processed or are in the queue

        Returns:
            int
        """
        return self._tasks._unfinished_tasks

    @property
    def tasks_by_scan(self):
        """
        Returns queued tasks grouped by scan
        """
        tasks = self._tasks._queue
        return_value = {}
        for task in tasks:
            return_value.setdefault(task.context.scanner.NAME, []).append(task)
        return return_value

    @property
    def cron_tasks(self):
        """
        List of cron tasks

        Returns:
            list
        """
        return self._cron_tasks.values()

    def cron_task(self, name):
        for task in self._cron_tasks.values():
            if task.func.NAME == name:
                return task

    def change_throttling_toucan(self, key, value):
        self.change_throttling(value)

    def change_throttling(self, new_value):
        """
        Change throttling value. Keeps throttling value between 0 and 1.

        Behaviour of algorithm is described in docs/throttling.md

        Only working tasks are closed here. Idle workers stop by themselves.
        """
        if new_value > 1:
            new_value = 1

        if new_value < 0:
            new_value = 0

        new_value = round(new_value * 100) / 100

        old_limit = self._limit
        self._limit = round(self._parallel_tasks * float(new_value))

        working_tasks = [number for number, task in self._task_workers.items()
                         if task is not None]
        current_tasks = len(self._task_workers)

        task_politic = cfg['service.scans.task_politic']

        if task_politic == self.TASKS_POLITIC_KILL_WORKING_FIRST:
            tasks_to_kill = current_tasks - self._limit
        elif task_politic == self.TASKS_POLITIC_KILL_PROPORTIONS:
            tasks_to_kill = round((old_limit - self._limit) *
                                  len(working_tasks) / self._parallel_tasks)
        elif task_politic == self.TASKS_POLITIC_KILL_WORKING:
            tasks_to_kill = (old_limit - self._limit) - (
                len(self._task_workers) - len(working_tasks))
        else:
            tasks_to_kill = 0

        log.debug('%s tasks will be killed', tasks_to_kill)

        for number in working_tasks:
            if tasks_to_kill <= 0:
                break
            self._task_workers[number].stop()
            tasks_to_kill -= 1

        self._limit = round(self._parallel_tasks * float(new_value))

        current_tasks = len(self._task_workers)

        for number in range(self._limit - current_tasks):
            self._task_workers[self._next_task_number] = None
            IOLoop.current().add_callback(
                partial(self.process_tasks, self._next_task_number))
            self._next_task_number += 1
class Scraper():

    def __init__(
            self,
            destinations=None,
            transform=None,
            headers={},
            max_clients=50,
            maxsize=50,
            connect_timeout=1200,
            request_timeout=600,):
        """Instantiate a tornado async http client to do multiple concurrent requests"""

        if None in [destinations, transform]:
            sys.stderr.write('You must pass both a collection of URLs and a transform function')
            raise SystemExit

        self.max_clients = max_clients
        self.maxsize = maxsize
        self.connect_timeout = connect_timeout
        self.request_timeout = request_timeout

        AsyncHTTPClient.configure("tornado.simple_httpclient.SimpleAsyncHTTPClient",
                                  max_clients=self.max_clients)

        self.http_client = AsyncHTTPClient()
        self.queue = Queue(maxsize=50)
        self.destinations = destinations
        self.transform = transform
        self.headers = headers
        self.read(self.destinations)
        self.get(self.transform, self.headers, self.connect_timeout,
                 self.request_timeout, self.http_client)
        self.loop = ioloop.IOLoop.current()
        self.join_future = self.queue.join()

        def done(future):
            self.loop.stop()

        self.join_future.add_done_callback(done)
        self.loop.start()

    @gen.coroutine
    def read(self, destinations):
        for url in destinations:
            yield self.queue.put(url)

    @gen.coroutine
    def get(self, transform, headers, connect_timeout, request_timeout, http_client):
        while True:
            url = yield self.queue.get()
            try:
                request = HTTPRequest(url,
                                      connect_timeout=connect_timeout,
                                      request_timeout=request_timeout,
                                      method="GET",
                                      headers=headers)
            except Exception as e:
                sys.stderr.write('Destination {0} returned error {1}'.format(url, str(e) + '\n'))

            future = self.http_client.fetch(request)

            def done_callback(future):
                body = future.result().body
                url = future.result().effective_url
                transform(body, url=url)
                self.queue.task_done()

            try:
                future.add_done_callback(done_callback)
            except Exception as e:
                sys.stderr.write(str(e))
                self.queue.put(url)
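# Hypothetical usage of the Scraper above; the URLs and the transform are
# placeholders. The constructor starts the IOLoop itself and blocks until the
# queue is drained, so results arrive through the transform callback.
def print_length(body, url=None):
    print(url, len(body))

Scraper(destinations=['https://example.com/', 'https://example.org/'],
        transform=print_length)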
class BaseSpider(object):
    url_parser = None

    def __init__(self, engine, concurrent=3):
        self.engine = engine
        self.http = httpclient.AsyncHTTPClient()
        self.queue = Queue()
        self.concurrency = concurrent

    @property
    def hostname(self):
        return self.url_parser.hostname

    @property
    def url_root(self):
        return self.url_parser.url_root

    @property
    def base_url(self):
        return self.url_parser.base_url

    @gen.coroutine
    def __worker(self):
        """Consumes the queue."""
        while True:
            yield self.fetch_url()

    @gen.coroutine
    def crawl(self, description, location):
        """Starts crawling the specified URL."""
        url = self.url_parser(description, location)
        self.queue.put(url)
        self.engine.notify_started(self)
        for _ in range(self.concurrency):
            self.__worker()
        yield self.queue.join()
        self.engine.notify_finished(self)

    @gen.coroutine
    def fetch_url(self):
        """Retrieves a URL from the queue and returns the parsed data."""
        url = yield self.queue.get()
        logger.info('fetching %s' % url)
        try:
            response = yield self.http.fetch(url)
            soup = BeautifulSoup(response.body)
            logger.info('got response %s' % url)
            urls = yield self.fetch_links(response, soup)
            for new_url in urls:
                logger.debug('Added %s to queue' % new_url)
                yield self.queue.put(new_url)
            data = yield self.parse_response(response, soup)
            logger.info('Parsed response for %s' % url)
        except (httpclient.HTTPError, ValueError):
            message = 'HTTP Error: (%s)' % url
            self.engine.write_message(message, self.engine.STATUS_ERROR)
        else:
            self.engine.write_data(data)
        finally:
            self.queue.task_done()

    @gen.coroutine
    def fetch_links(self, response, soup):
        """Fetch URLs to be added to the queue."""
        raise gen.Return([])

    def parse_response(self, response, soup):
        """Extract information from the response; the return value should be
        a list of dicts.

        Sample dict:
        {
            'title': 'Job Title',
            'company': 'Company Name',
            'location': 'City/State/Country',
            'tags': ['tag1', 'tag2', 'tag3'],
            'category': 'Software Developer',
            'origin': 'Name of the origin website',
            'url': 'Link to the complete job description',
        }
        """
        raise NotImplementedError
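# A minimal, hypothetical BaseSpider subclass; the parser object, CSS
# selectors, and site URLs are invented for illustration, not part of the
# original code.
class ExampleJobsParser(object):
    hostname = 'jobs.example.com'
    url_root = 'https://jobs.example.com/'
    base_url = 'https://jobs.example.com/search'

    def __call__(self, description, location):
        return '{}?q={}&l={}'.format(self.base_url, description, location)

class ExampleJobsSpider(BaseSpider):
    url_parser = ExampleJobsParser()

    @gen.coroutine
    def fetch_links(self, response, soup):
        # follow pagination links, if any
        raise gen.Return([a['href'] for a in soup.select('a.next-page')])

    @gen.coroutine
    def parse_response(self, response, soup):
        raise gen.Return([{'title': el.get_text(),
                           'origin': self.hostname,
                           'url': response.effective_url}
                          for el in soup.select('h2.job-title')])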
class BatchedStream(object):
    """ Mostly obsolete, see BatchedSend """

    def __init__(self, stream, interval):
        self.stream = stream
        self.interval = interval / 1000.
        self.last_transmission = default_timer()
        self.send_q = Queue()
        self.recv_q = Queue()
        self._background_send_coroutine = self._background_send()
        self._background_recv_coroutine = self._background_recv()
        self._broken = None

        self.pc = PeriodicCallback(lambda: None, 100)
        self.pc.start()

    @gen.coroutine
    def _background_send(self):
        with log_errors():
            while True:
                msg = yield self.send_q.get()
                if msg == 'close':
                    break
                msgs = [msg]
                now = default_timer()
                wait_time = self.last_transmission + self.interval - now
                if wait_time > 0:
                    yield gen.sleep(wait_time)
                while not self.send_q.empty():
                    msgs.append(self.send_q.get_nowait())
                try:
                    yield write(self.stream, msgs)
                except StreamClosedError:
                    self.recv_q.put_nowait('close')
                    self._broken = True
                    break
                if len(msgs) > 1:
                    logger.debug("Batched messages: %d", len(msgs))
                for _ in msgs:
                    self.send_q.task_done()

    @gen.coroutine
    def _background_recv(self):
        with log_errors():
            while True:
                try:
                    msgs = yield read(self.stream)
                except StreamClosedError:
                    self.recv_q.put_nowait('close')
                    self.send_q.put_nowait('close')
                    self._broken = True
                    break
                assert isinstance(msgs, list)
                if len(msgs) > 1:
                    logger.debug("Batched messages: %d", len(msgs))
                for msg in msgs:
                    self.recv_q.put_nowait(msg)

    @gen.coroutine
    def flush(self):
        yield self.send_q.join()

    @gen.coroutine
    def send(self, msg):
        if self._broken:
            raise StreamClosedError('Batch Stream is Closed')
        else:
            self.send_q.put_nowait(msg)

    @gen.coroutine
    def recv(self):
        result = yield self.recv_q.get()
        if result == 'close':
            raise StreamClosedError('Batched Stream is Closed')
        else:
            raise gen.Return(result)

    @gen.coroutine
    def close(self):
        yield self.flush()
        raise gen.Return(self.stream.close())

    def closed(self):
        return self.stream.closed()
class BlogBackup(object):
    _default_dir_name = "seg_blog_backup"

    def _generate_save_dir(self):
        cur_dir = os.path.dirname(__file__)
        self.save_path = os.path.join(cur_dir, self._default_dir_name)
        if not os.path.isdir(self.save_path):
            os.mkdir(self.save_path)

    def _parse_save_path(self):
        if self.save_path:
            if os.path.exists(self.save_path) and os.path.isdir(self.save_path):
                return
            else:
                raise BlogSavePathError("'%s' not exists or is not dir!" % self.save_path)
        else:
            self._generate_save_dir()

    @staticmethod
    def parse_token_from_html(content):
        overall_pat = re.compile(r"SF.token =.*?,\s+_\w+ = [\d,\[\]]+;", re.DOTALL)
        overall_res = overall_pat.search(content)
        if overall_res:
            overall_content = overall_res.group()
            # remove /* ... */ style comments
            filter_res = re.sub(r"(/\*[/a-zA-Z\d' ]+\*/)", "", overall_content)
            str_list = re.findall(r"(?<!//)'([a-zA-Z\d]+)'", filter_res, re.DOTALL)
            filter_list = re.findall(r"\[(\d+),(\d+)\]", overall_content)
            ret = "".join(str_list)
            if filter_list:
                for m, n in filter_list:
                    ret = ret[:int(m)] + ret[int(n):]
            if len(ret) == 32:
                return ret
        raise PageHtmlChanged("website login token has changed")

    def _get_user_cookies(self):
        s = requests.Session()
        s.headers.update(headers)
        rep = s.get(target_url)
        post_url = "%s%s?_=%s" % (target_url, login_api_path,
                                  self.parse_token_from_html(rep.text))
        data = {"mail": self.username, "password": self.passwd}
        s.post(post_url, data=data)
        return s.cookies

    def __init__(self, **conf):
        self.username = conf["username"]
        self.passwd = conf["passwd"]
        self.save_path = conf.get("save_path")
        self._q = Queue()
        self._cookies = self._get_user_cookies()
        self._parse_save_path()

    @gen.coroutine
    def run(self):
        start_url = target_url + blog_path
        yield self._fetch_blog_list_page(start_url)
        for _ in xrange(cpu_count()):
            self._fetch_essay_content()
        yield self._q.join()

    @gen.coroutine
    def _fetch_blog_list_page(self, page_link):
        ret = requests.get(page_link, cookies=self._cookies)
        d = pq(ret.text)
        link_elements = d(".stream-list__item > .summary > h2 > a")
        for link in link_elements:
            yield self._q.put(d(link).attr("href"))
        next_ele = d(".pagination li.next a")
        if next_ele:
            next_page_url = target_url + next_ele.attr("href")
            self._fetch_blog_list_page(next_page_url)

    @gen.coroutine
    def _fetch_essay_content(self):
        while True:
            try:
                essay_path = yield self._q.get(timeout=1)
                essay_url = target_url + essay_path + edit_suffix
                ret = requests.get(essay_url, cookies=self._cookies)
                d = pq(ret.text)
                title = d("#myTitle").val()
                content = d("#myEditor").text()
                real_file_name = os.path.join(self.save_path, title + ".md")
                logger.info("backing up essay: %s" % title)
                with open(real_file_name, "w") as f:
                    f.writelines(content.encode("utf8"))
            except gen.TimeoutError:
                raise gen.Return()
            finally:
                self._q.task_done()
class BlogBackup(object):
    _default_dir_name = 'seg_blog_backup'

    def _generate_save_dir(self):
        cur_dir = os.path.dirname(__file__)
        self.save_path = os.path.join(cur_dir, self._default_dir_name)
        if not os.path.isdir(self.save_path):
            os.mkdir(self.save_path)

    def _parse_save_path(self):
        if self.save_path:
            if os.path.exists(self.save_path) and os.path.isdir(self.save_path):
                return
            else:
                raise BlogSavePathError("'%s' not exists or is not dir!" % self.save_path)
        else:
            self._generate_save_dir()

    def _get_user_cookies(self):
        url = target_url + login_page_path
        self.driver.get(url)
        try:
            user_input = self.driver.find_element_by_name('mail')
            passwd_input = self.driver.find_element_by_name('password')
            submit_btn = self.driver.find_element_by_class_name('pr20')
        except NoSuchElementException:
            raise PageHtmlChanged("%s login page structure has changed!" % _domain)

        user_input.send_keys(self.username)
        passwd_input.send_keys(self.passwd)
        submit_btn.click()
        try:
            WebDriverWait(self.driver, 3).until(staleness_of(submit_btn))
        except TimeoutException:
            raise Exception("Wrong username or password!")

        WebDriverWait(self.driver, timeout=10).until(has_page_load)
        try_times = 0
        while True:
            time.sleep(1)
            if url != self.driver.current_url:
                return self.driver.get_cookies()
            try_times += 1
            if try_times > 10:
                raise Exception("Getting cookie info failed!")

    def _get_driver(self):
        if self.phantomjs_path:
            try:
                return webdriver.PhantomJS(executable_path=self.phantomjs_path,
                                           service_log_path=os.path.devnull)
            except WebDriverException:
                raise PhantomjsPathError("Phantomjs locate path invalid!")
        else:
            return webdriver.PhantomJS(service_log_path=os.path.devnull)

    def __init__(self, **conf):
        self.username = conf['username']
        self.passwd = conf['passwd']
        self.phantomjs_path = conf.get('phantomjs_path')
        self.save_path = conf.get('save_path')
        self._q = Queue()
        self._parse_save_path()
        self.driver = self._get_driver()
        self._cookies = self._get_user_cookies()

    @gen.coroutine
    def run(self):
        self.__filter_cookies()
        start_url = target_url + blog_path
        yield self._fetch_blog_list_page(start_url)
        for _ in xrange(cpu_count()):
            self._fetch_essay_content()
        yield self._q.join()

    def __filter_cookies(self):
        self._cookies = {k['name']: k['value'] for k in self._cookies
                         if k['domain'] == _domain}

    @gen.coroutine
    def _fetch_blog_list_page(self, page_link):
        ret = requests.get(page_link, cookies=self._cookies)
        d = pq(ret.text)
        link_elements = d('.stream-list__item > .summary > h2 > a')
        for link in link_elements:
            yield self._q.put(d(link).attr('href'))
        next_ele = d('.pagination li.next a')
        if next_ele:
            next_page_url = target_url + next_ele.attr('href')
            self._fetch_blog_list_page(next_page_url)

    @gen.coroutine
    def _fetch_essay_content(self):
        while True:
            try:
                essay_path = yield self._q.get(timeout=1)
                essay_url = target_url + essay_path + edit_suffix
                ret = requests.get(essay_url, cookies=self._cookies)
                d = pq(ret.text)
                title = d("#myTitle").val()
                content = d("#myEditor").text()
                file_name = title + '.md'
                real_file_name = os.path.join(self.save_path, file_name)
                with open(real_file_name, 'w') as f:
                    f.writelines(content.encode('utf8'))
            except gen.TimeoutError:
                raise gen.Return()
            finally:
                self._q.task_done()
def main():
    # Start consumer without waiting
    # Tornado framework used for async IO
    # http://www.tornadoweb.org/en/stable/index.html
    q = Queue()

    @gen.coroutine
    def consumer():
        item = yield q.get()
        try:
            code = False
            try:
                response = yield httpclient.AsyncHTTPClient().fetch(item)
                codes = ['200', '301', '302']
                code = any(s in response.headers['Status'] for s in codes)
                rcode = response.code
                if DEBUG:
                    fname = re.match(r'http://([\w+|.]+)/', item).group(1)
                    fname = os.path.join(DEBUG_DIR, fname.replace(".", "_"))
                    with open(fname, 'w') as f:
                        for k, v in response.headers.get_all():
                            f.write(k + ' ' + v + '\n')
                        f.write('\n')
                        f.write(response.body)
            except Exception as e:
                code = False
                rcode = str(e)
            print('%s,%s,%s,"%s"' % (datetime.now(), item, code, rcode))
            # Append to DOMAINS found URL
            if code:
                DOMAINS[RESULT[item]].append(item)
        finally:
            q.task_done()

    @gen.coroutine
    def worker():
        while True:
            yield consumer()

    @gen.coroutine
    def producer():
        if DEBUG and not os.path.exists(DEBUG_DIR):
            print('Creating debug out dir: %s' % DEBUG_DIR)
            os.makedirs(DEBUG_DIR)

        # Open and process file if supplied
        if len(sys.argv) >= 2:
            with open(sys.argv[1]) as f:
                for line in f:
                    DOMAINS[line.strip()] = []
        else:
            print("Domains list file wasn't provided")
            print("Usage: %s <domains.txt> [ report.txt ]" % sys.argv[0])
            sys.exit(2)

        # Generate processing list
        for d in DOMAINS.keys():
            for url in generate_url_list(d):
                q.put(url)

    yield producer()  # Wait for producer to put all tasks.

    # Start workers, then wait for the work queue to be empty.
    for _ in range(concurrency):
        worker()
    yield q.join()  # Wait for consumer to finish all tasks.

    # Out results
    if len(sys.argv) >= 3:
        f = open(sys.argv[2], 'w')
    else:
        f = sys.stdout
    for key, val in DOMAINS.items():
        if DOMAINS[key]:
            DOMAINS[key] = '"' + " ".join(val) + '"'
        else:
            DOMAINS[key] = 'No'
    out = "\n".join([",".join([key, str(val)]) for key, val in DOMAINS.items()]) + '\n'
    f.write(out)
class TaskLogger(object):
    def __init__(self, task_id, engine=EngineType.REQUESTS, io_loop=None,
                 task_url=TASK_URL, wrap=False, tenant=None):
        self.task_id = task_id
        self.task_url = task_url
        self._seq = 0
        self._partial_log_url = self._get_partial_url('log')
        self._partial_result_url = self._get_partial_url('result')
        self.wrap = wrap
        if wrap and tenant:
            self._partial_log_url = update_query_params(
                self._partial_log_url, {'tenant': tenant})
            self._partial_result_url = update_query_params(
                self._partial_result_url, {'tenant': tenant})

        if engine == EngineType.REQUESTS:
            self.log = self._log_by_requests
            self.result = self._result_by_requests
        elif engine == EngineType.TORNADO:
            io_loop = io_loop if io_loop else IOLoop.current()
            self._http_client = AsyncHTTPClient(io_loop=io_loop)
            # tracks in-flight log requests so result() can wait for them
            self._queue = Queue()
            self.log = self._log_by_tornado
            self.result = self._result_by_tornado
        else:
            raise TaskLoggerError('', reason='engine only supports {}'.format(
                EngineType.types_str()))

    def _get_partial_url(self, partial_name):
        url = urljoin(self.task_url, partial_name)
        url = update_query_params(url, {'task_id': self.task_id})
        return url

    def _get_log_url(self, seq):
        url = update_query_params(self._partial_log_url, {'seq': seq})
        return url

    def _get_result_url(self, seq, exit_code=0):
        url = update_query_params(self._partial_result_url, {
            'seq': seq,
            'exit_code': exit_code
        })
        return url

    def _log_by_requests(self, log):
        self._seq += 1
        log_url = self._get_log_url(self._seq)
        data = self._create_log(log, self._seq)
        self._send_by_requests(log_url, data)

    def _result_by_requests(self, result, exit_code=0):
        self._seq += 1
        result_url = self._get_result_url(self._seq, exit_code)
        data = self._create_result(result, self._seq, exit_code=exit_code)
        self._send_by_requests(result_url, data)

    @staticmethod
    def _send_by_requests(url, data):
        res = requests.post(url, data=data, verify=False)
        if res.status_code != 200:
            raise TaskLoggerError(data, reason=res.reason)

    @gen.coroutine
    def _log_by_tornado(self, log):
        # register this request in the queue so join() can track it
        yield self._queue.put(1)
        self._seq += 1
        log_url = self._get_log_url(self._seq)
        data = self._create_log(log, self._seq)
        try:
            yield self._send_by_tornado(log_url, data)
        finally:
            yield self._queue.get()
            self._queue.task_done()

    @gen.coroutine
    def _result_by_tornado(self, result, exit_code=0):
        # wait for all outstanding log requests before sending the result
        yield self._queue.join()
        self._seq += 1
        result_url = self._get_result_url(self._seq, exit_code)
        data = self._create_result(result, self._seq, exit_code=exit_code)
        yield self._send_by_tornado(result_url, data)

    @gen.coroutine
    def _send_by_tornado(self, url, data):
        try:
            response = yield self._http_client.fetch(
                url, method='POST',
                headers={'Content-Type': 'application/json'},
                validate_cert=False, body=data)
        except Exception as exc:
            if hasattr(exc, 'response') and exc.response:
                exc = 'url:{}, exc:{}, body:{}'.format(
                    url, exc, exc.response.body)
            raise TaskLoggerError(data, str(exc))
        else:
            if response.code != 200:
                raise TaskLoggerError(data, reason=response.body)

    def _create_log(self, log, seq):
        assert isinstance(log, basestring)
        log = log + '\n'
        if self.wrap:
            log_msg = TaskLogMessage(task_id=self.task_id, log=log, seq=seq)
            data = json_encode({'messages': log_msg})
        else:
            data = log
        return data

    def _create_result(self, result, seq, exit_code):
        assert isinstance(result, basestring)
        result = result + '\n'
        if self.wrap:
            result_msg = TaskResultMessage(task_id=self.task_id, result=result,
                                           seq=seq, exit_code=exit_code)
            data = json_encode({'messages': result_msg})
        else:
            data = result
        return data
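# A sketch of driving TaskLogger with its Tornado engine from inside a
# coroutine, assuming the TASK_URL endpoint is reachable; the task id and
# message strings are hypothetical.
@gen.coroutine
def report_progress():
    task_logger = TaskLogger('task-42', engine=EngineType.TORNADO)
    yield task_logger.log('step 1 complete')
    yield task_logger.log('step 2 complete')
    # result() joins the internal queue first, so it is only sent after
    # every outstanding log request has finished
    yield task_logger.result('done', exit_code=0)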
class TornadoPikaPublisher(BeergardenPublisher, PikaClient):

    def __init__(self, **kwargs):
        self.logger = logging.getLogger(__name__)

        self._shutdown_timeout = timedelta(
            seconds=kwargs.pop('shutdown_timeout', 5))
        self._work_queue = Queue()
        self._connection = None
        self._channel = None

        self.coroutiner = CoroutineMaker({
            'TornadoConnection': 'on_open_callback',
            'channel': 'on_open_callback'
        })

        # Trying to get super() to work with incompatible signatures is a nightmare
        BeergardenPublisher.__init__(self)
        PikaClient.__init__(self, **kwargs)

        IOLoop.current().spawn_callback(self._process)

    def shutdown(self):
        return self._work_queue.join(timeout=self._shutdown_timeout)

    @coroutine
    def _open_connection(self):
        self._connection = yield self.coroutiner.convert(TornadoConnection)(
            parameters=self._conn_params,
            stop_ioloop_on_close=False)

    @coroutine
    def _open_channel(self):
        self._channel = yield self.coroutiner.convert(
            self._connection.channel)()

    @coroutine
    def _process(self):
        while True:
            item = yield self._work_queue.get()
            try:
                if not self._connection or not self._connection.is_open:
                    yield self._open_connection()
                if not self._channel or not self._channel.is_open:
                    yield self._open_channel()

                yield getattr(self._channel, item[0])(**item[1])
            finally:
                self._work_queue.task_done()

    def publish(self, message, **kwargs):
        """Publish a message.

        :param message: The message to publish
        :param kwargs: Additional message properties
        :Keyword Arguments:
            * *routing_key* --
              Routing key to use when publishing
            * *headers* --
              Headers to be included as part of the message properties
            * *expiration* --
              Expiration to be included as part of the message properties
        :return: None
        """
        self._work_queue.put(('basic_publish', {
            'exchange': self._exchange,
            'routing_key': kwargs['routing_key'],
            'body': message,
            'properties': BasicProperties(
                app_id='beer-garden',
                content_type='text/plain',
                headers=kwargs.pop('headers', None),
                expiration=kwargs.pop('expiration', None))
        }))

    def _event_publish_args(self, event, **kwargs):
        # Main thing we need to do here is figure out the appropriate routing key
        args = {}
        if event.metadata and 'routing_key' in event.metadata:
            args['routing_key'] = event.metadata['routing_key']
        elif 'request' in kwargs:
            request = kwargs['request']
            args['routing_key'] = get_routing_key(
                'request', request.system, request.system_version,
                request.instance_name)
        else:
            args['routing_key'] = 'beergarden'

        return args
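# A hypothetical publish call against the class above; the connection
# kwargs are assumptions about what PikaClient accepts, and an IOLoop must
# be running for the spawned _process coroutine to drain the work queue.
publisher = TornadoPikaPublisher(host='localhost', port=5672,
                                 shutdown_timeout=10)
publisher.publish('hello beer-garden', routing_key='beergarden',
                  expiration='60000')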
class BatchedStream(object):
    """ Mostly obsolete, see BatchedSend """

    def __init__(self, stream, interval):
        self.stream = stream
        self.interval = interval / 1000.0
        self.last_transmission = default_timer()
        self.send_q = Queue()
        self.recv_q = Queue()
        self._background_send_coroutine = self._background_send()
        self._background_recv_coroutine = self._background_recv()
        self._broken = None

        self.pc = PeriodicCallback(lambda: None, 100)
        self.pc.start()

    @gen.coroutine
    def _background_send(self):
        with log_errors():
            while True:
                msg = yield self.send_q.get()
                if msg == "close":
                    break
                msgs = [msg]
                now = default_timer()
                wait_time = self.last_transmission + self.interval - now
                if wait_time > 0:
                    yield gen.sleep(wait_time)
                while not self.send_q.empty():
                    msgs.append(self.send_q.get_nowait())

                try:
                    yield write(self.stream, msgs)
                except StreamClosedError:
                    self.recv_q.put_nowait("close")
                    self._broken = True
                    break

                if len(msgs) > 1:
                    logger.debug("Batched messages: %d", len(msgs))
                for _ in msgs:
                    self.send_q.task_done()

    @gen.coroutine
    def _background_recv(self):
        with log_errors():
            while True:
                try:
                    msgs = yield read(self.stream)
                except StreamClosedError:
                    self.recv_q.put_nowait("close")
                    self.send_q.put_nowait("close")
                    self._broken = True
                    break
                assert isinstance(msgs, list)
                if len(msgs) > 1:
                    logger.debug("Batched messages: %d", len(msgs))
                for msg in msgs:
                    self.recv_q.put_nowait(msg)

    @gen.coroutine
    def flush(self):
        yield self.send_q.join()

    @gen.coroutine
    def send(self, msg):
        if self._broken:
            raise StreamClosedError("Batch Stream is Closed")
        else:
            self.send_q.put_nowait(msg)

    @gen.coroutine
    def recv(self):
        result = yield self.recv_q.get()
        if result == "close":
            raise StreamClosedError("Batched Stream is Closed")
        else:
            raise gen.Return(result)

    @gen.coroutine
    def close(self):
        yield self.flush()
        raise gen.Return(self.stream.close())

    def closed(self):
        return self.stream.closed()
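# A minimal sketch of the send/flush/close lifecycle for BatchedStream,
# assuming an already-connected stream compatible with the write()/read()
# helpers above; the 100 ms batching interval is an arbitrary example value.
@gen.coroutine
def send_batched(stream, messages):
    batched = BatchedStream(stream, interval=100)
    for msg in messages:
        yield batched.send(msg)
    yield batched.flush()  # joins send_q: blocks until every message is written
    yield batched.close()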
class SQSSource(object):
    """Implementation of ISource that receives messages from a SQS queue.
    """

    max_delete_delay = 5

    def __init__(self, logger, loop, gate, sqs_client, metric_prefix='source'):
        self.gate = gate
        self.collector = sqs_client
        self.logger = logger
        self.loop = loop
        self.metric_prefix = metric_prefix
        self.end_of_input = Event()
        self.input_error = Event()
        self.state = RUNNING
        self._delete_queue = Queue()
        self._should_flush_queue = Event()
        self.sender_tag = 'sender:%s.%s' % (self.__class__.__module__,
                                            self.__class__.__name__)
        self.loop.spawn_callback(self.onInput)
        self.loop.spawn_callback(self._onDelete)

    @gen.coroutine
    def close(self, timeout=None):
        self.state = CLOSING
        self.logger.warning('Closing source')
        yield self._delete_queue.join(timeout)

    @gen.coroutine
    def _flush_delete_batch(self, batch_size):
        delete_batch = [
            self._delete_queue.get_nowait()
            for pos in range(min(batch_size, self.collector.max_messages))
        ]
        try:
            response = yield self.collector.delete_message_batch(*delete_batch)
        except SQSError as err:
            lmsg = 'Error encountered deleting processed messages in SQS: %s'
            self.logger.exception(lmsg, err)
            self.input_error.set()

            for msg in delete_batch:
                self._delete_queue.put_nowait(msg)
        else:
            if response.Failed:
                self.input_error.set()
                for req in response.Failed:
                    self.logger.error('Message failed to delete: %s', req.Id)
                    self._delete_queue.put_nowait(req)

    @gen.coroutine
    def _onDelete(self):
        respawn = True
        while respawn:
            try:
                qsize = self._delete_queue.qsize()
                # This will keep flushing until clear,
                # including items that show up in between flushes
                while qsize > 0:
                    yield self._flush_delete_batch(qsize)
                    qsize = self._delete_queue.qsize()
                self._should_flush_queue.clear()
                yield self._should_flush_queue.wait()
            except Exception as err:
                self.logger.exception(err)
                self.input_error.set()
                respawn = False

    @gen.coroutine
    def onInput(self):
        respawn = True
        retry_timeout = INITIAL_TIMEOUT
        # We use an algorithm similar to TCP window scaling,
        # so that we request fewer messages when we encounter
        # back pressure from our gate/drain and request more
        # when we flushed a complete batch
        window_size = self.collector.max_messages
        while respawn:
            try:
                response = yield self.collector.receive_message_batch(
                    max_messages=window_size,
                )
                if response.Messages:
                    # We need to have low latency to delete messages
                    # we've processed
                    retry_timeout = INITIAL_TIMEOUT
                else:
                    retry_timeout = min(retry_timeout * 2, MAX_TIMEOUT)
                    yield gen.sleep(retry_timeout.total_seconds())

                sent_full_batch = True
                for position, msg in enumerate(response.Messages):
                    try:
                        self.gate.put_nowait(msg)
                    except QueueFull:
                        self.logger.debug('Gate queue full; yielding')
                        sent_full_batch = False
                        # TODO: is it worth trying to batch and schedule
                        #       a flush at this point instead of many
                        #       single deletes?
                        yield self.gate.put(msg)
                    self._should_flush_queue.set()
                    self._delete_queue.put_nowait(msg)
                    statsd.increment('%s.queued' % self.metric_prefix,
                                     tags=[self.sender_tag])

                # If we were able to flush the entire batch without waiting,
                # increase our window size toward max_messages
                if sent_full_batch and \
                        window_size < self.collector.max_messages:
                    window_size += 1
                # Otherwise ask for fewer next time
                elif not sent_full_batch and window_size > 1:
                    window_size -= 1
            except Exception as err:
                self.logger.exception(err)
                self.input_error.set()
                respawn = False