def get_data(cls, account, source_filter, limit=100, skip=0):
    # set up our queue and semaphore
    queue = JoinableQueue()
    sem = BoundedSemaphore(FETCH_CONCURRENCY)
    done, working = set(), set()
    data = []

    # set up our coroutine to fetch our pages
    @gen.coroutine
    def fetch_url():
        current_url = yield queue.get()
        try:
            if current_url in working:
                return
            page_no = len(working)
            # add the url we're fetching to our working set
            working.add(current_url)
            # and get it
            req = account.get_request(current_url)
            client = AsyncHTTPClient()
            response = yield client.fetch(req)
            # now we add our url to the set of done pages
            done.add(current_url)
            # and append the data we've received
            response_data = json.loads(response.body.decode('utf-8'))
            data.extend(response_data.get('items', []))
            # check to see if there is a next page
            url = response_data.get('@odata.nextLink', None)
            if url is not None:
                # and if there is one, stuff it in the queue
                queue.put(url)
        finally:
            # ...and finally signal that we're done and release our semaphore
            queue.task_done()
            sem.release()

    # and set up the coroutine for our worker
    @gen.coroutine
    def worker():
        while True:
            yield sem.acquire()
            fetch_url()

    # the urls we will be fetching data from
    uris = ['http://some_paginated_odata_api/example/',
            'http://some_paginated_odata_api/example2/']
    # fill our queue
    for uri in uris:
        queue.put(uri)
    # start our queue worker
    worker()
    # wait until we're done
    yield queue.join(deadline=timedelta(seconds=MAXIMUM_REQ_TIME))
    # this helper function will "format" our data according to the schema we've specified above
    formatted = cls.format_data_to_schema(data)
    # and we're done
    return json.dumps(formatted)
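For reference, the same fan-out pattern can be distilled into a minimal, self-contained sketch. This assumes Tornado 4.2+, where tornado.queues.Queue provides task_done()/join() and tornado.locks.BoundedSemaphore replaces the older toro primitives; the URLs and CONCURRENCY value are placeholders, not part of the example above.

# Minimal sketch of the queue-plus-semaphore fan-out pattern (assumptions noted above).
from tornado import gen, ioloop, locks, queues
from tornado.httpclient import AsyncHTTPClient

CONCURRENCY = 10


@gen.coroutine
def crawl_urls(urls):
    queue = queues.Queue()
    sem = locks.BoundedSemaphore(CONCURRENCY)
    sizes = []

    @gen.coroutine
    def fetch_one():
        url = yield queue.get()
        try:
            response = yield AsyncHTTPClient().fetch(url)
            sizes.append(len(response.body))
        finally:
            queue.task_done()   # lets queue.join() complete
            sem.release()       # frees a slot for the dispatcher

    @gen.coroutine
    def dispatcher():
        while True:
            yield sem.acquire()
            fetch_one()         # fire-and-forget; cleanup happens in its finally block

    for url in urls:
        queue.put(url)

    dispatcher()                # start handing work to fetchers
    yield queue.join()          # resolves once every queued URL is task_done()
    raise gen.Return(sizes)


if __name__ == '__main__':
    print(ioloop.IOLoop.current().run_sync(
        lambda: crawl_urls(['http://example.com/', 'http://example.org/'])))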
class TerminalSession(RTOManager, QueclinkProtocol):
    r"""Base session implementation class.

    A session is a shared object and the low-level code for a connection.

    Parameters
    ----------
    session_key - IMEI of device
    """
    STOP_FLAG = False

    def __init__(self, server, conn, stream, io_loop=None, *args, **kwargs):
        self.server = server
        self.stream = stream
        self.conn = conn(self,
                         main_server_ip=self.server.ipaddr,
                         main_server_port=self.server.port,)
        self.session_key = None
        self.state = CONNECTING
        self.io_loop = io_loop
        self.stream.set_close_callback(self.socket_closed)
        self.job_queue = JoinableQueue()
        self.registered_cmds = {}
        self.cnt_number = '0000'
        self.io_loop.add_future(self.init_workflow(),
                                lambda future: future.result())
        super(TerminalSession, self).__init__(*args, **kwargs)

    def is_open(self):
        return self.state == OPEN

    def is_closed(self):
        return self.state == CLOSED

    @gen.coroutine
    def open(self):
        """Opens the connection.

        According to the protocol, we should first configure the device
        and only after that flag the connection as opened."""
        # unique_id = yield self.conn.configure()
        log = yield self.conn.verify_conn()
        unique_id = log.unique_id
        gen_log.info('CONNECTION OPENED WITH: %s' % unique_id)
        if unique_id:
            self.session_key = unique_id
            self.state = OPEN
            self.conn.on_open(unique_id)
            raise gen.Return(unique_id)
        raise gen.Return(conf.DISCONN_RESULT)

    @gen.coroutine
    def read_message(self):
        r"""Callbacks take two arguments: binary data and job code."""
        if self.stream.closed():
            raise StreamClosedError("Stream is closed")
        message = yield gen.Task(self.stream.read_until, conf.END_SIGN)
        raise gen.Return(message)

    @gen.coroutine
    def terminal_message_flow(self, msg):
        r"""Runs the message flow for a single incoming message."""
        try:
            log, sack, from_buffer = super(
                TerminalSession, self).terminal_message_flow(msg)
        except MessageNotImplemented as e:
            # silence the exception
            gen_log.exception(e)
            return
        count_num = log.log.count_number
        if log.type == conf.ACK:
            if not log.header == conf.HEARTBEAT_ACK:
                self.unregister_command_on_ack(log.log)
            yield self.conn.on_ack(msg, log, sack)
        else:
            if self.is_pending_rto(log.header):
                self.make_rto_response(log)
            # bad accuracy of gps
            # maybe warn our guys by email that gps accuracy is weak
            # skip-message logic
            if log.header == conf.FIXED_REPORT:
                gps_accuracy = int(log.log.gps_accuracy)
                if not gps_accuracy:  # or (20 < gps_accuracy <= 50):
                    self.skip_message = True
                else:
                    self.skip_message = False
            if getattr(self, 'skip_message', False):
                gen_log.info("Hey, GPS ACCURACY IS BAD")
                return
            yield self.conn.on_report(msg, log, sack, from_buffer=from_buffer)
        if not self.session_key and hasattr(log.log, 'unique_id'):
            self.session_key = log.log.unique_id
            self.state = OPEN
        raise gen.Return(count_num)

    def exec_command(self, msg_tp, header, body):
        r"""Sends a message to the end-point.

        Returns an awaitable future."""
        if not msg_tp == conf.COMMAND:
            raise ValueError("exec_command expects a COMMAND message type")
        serial_number, cmd_future = self.register_command_for_ack()
        body.update({'serial_number': serial_number})
        self.send_message(self.build_cmd(header, **body))
        return cmd_future

    def send_message(self, msg):
        self.stream.write(msg)
        gen_log.info("SACK: %s", msg)

    def should_stop(self):
        return self.STOP_FLAG

    @gen.coroutine
    def init_workflow(self):
        schedule_at_loop(self.io_loop, self._tail_messagebus,
                         callback=self._handle_message_flow)
        schedule_at_loop(self.io_loop, self._tail_stream_buffer,
                         callback=self._handle_message_flow)

    @gen.coroutine
    def _tail_messagebus(self):
        def job_complete(f):
            self.cnt_number = f.result()

        while True:
            if self.should_stop():
                break
            message = yield self.job_queue.get()
            schedule_at_loop(self.io_loop,
                             self.terminal_message_flow(message),
                             job_complete)
            self.job_queue.task_done()
            gen_log.info("INCOMING MSG: %s", message)

    @gen.coroutine
    def _tail_stream_buffer(self):
        while True:
            if self.should_stop():
                break
            message = yield self.read_message()
            yield self.job_queue.put(message)

    def _handle_message_flow(self, future):
        # there may be other errors, not yet identified, that can lead to
        # memory leaks because the file descriptor will not be closed
        try:
            future.result()
        except Exception as e:
            app_log.exception(e)
        finally:
            self.STOP_FLAG = True
            self.close()

    def register_command_for_ack(self):
        r"""Returns a serial number and a future promise."""
        serial_number = ''
        while not serial_number or serial_number in self.registered_cmds:
            serial_number = generate_random_hex()
        f = self.registered_cmds[serial_number] = Future()
        return serial_number, f

    def unregister_command_on_ack(self, log):
        serial_number = log.serial_number
        future = self.registered_cmds.pop(serial_number, None)
        if not future:
            return
        if future.done():
            return future
        if not isinstance(log, Exception):
            future.set_result(log)
        else:
            future.set_exception(log)

    def unregister_commands(self):
        if not self.is_open():
            for evt in list(self.registered_cmds.keys()):
                f = self.registered_cmds.pop(evt)
                f.set_result(conf.DISCONN_RESULT)
            self.registered_cmds.clear()

    def socket_closed(self):
        self.state = CLOSING
        self.close()

    def close(self):
        self.unregister_commands()
        self.server.close_session(self.session_key)
        self.conn.on_close()
        self.stream.close()
        self.state = CLOSED
        gen_log.info("CONNECTION CLOSED: %s", self.session_key)
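The command/ACK correlation used above can be illustrated with a distilled sketch: each outgoing command registers a Future under a fresh serial number, and the matching ACK resolves that Future, so exec_command() callers simply yield it. The CommandRegistry name and the uuid-based serial generator below are hypothetical; the real code uses generate_random_hex() and keeps the futures on the session object.

# Hypothetical, self-contained sketch of the command/ACK Future registry.
import uuid

from tornado.concurrent import Future


class CommandRegistry(object):
    def __init__(self):
        self._pending = {}

    def register(self):
        """Return (serial_number, future); the future resolves when the ACK arrives."""
        serial_number = uuid.uuid4().hex[:8]
        while serial_number in self._pending:
            serial_number = uuid.uuid4().hex[:8]
        future = self._pending[serial_number] = Future()
        return serial_number, future

    def resolve(self, serial_number, ack_log):
        """Called from the read loop when an ACK carrying this serial is parsed."""
        future = self._pending.pop(serial_number, None)
        if future is not None and not future.done():
            future.set_result(ack_log)

    def abort_all(self, result):
        """On disconnect, unblock every waiter with a sentinel result."""
        while self._pending:
            _, future = self._pending.popitem()
            if not future.done():
                future.set_result(result)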
class TornadoSpider:
    def __init__(self, start_url, sitemap_url=None,
                 max_concurrent_connections=MAX_CONCURRENT_REQUESTS_PER_SERVER):
        self.visited_urls = set()
        self.intermediate_urls = set()
        self.base_domain = extract_domain(start_url)
        self.base_site = extract_base_site(start_url)
        self.base_page = _get_client_page(start_url, None, start_url,
                                          self.base_domain, DOMAINS_TO_BE_SKIPPED)
        self.non_visited_urls = {self.base_page}
        self.added_count = 1
        self.idle_ping = 0
        self.start_idle_counter = False
        self.sitemap_url = u'{}/sitemap.xml'.format(self.base_site) if not sitemap_url else sitemap_url
        self.max_concurrent_connections = max_concurrent_connections
        self.page_queue = JoinableQueue()
        self.semaphore = BoundedSemaphore(self.max_concurrent_connections)
        self.start = time.time()
        self.skip_count = 0

    @coroutine
    def initiate_crawl(self):
        self.non_visited_urls.add(self.base_page)
        self.add_sitemap_urls(self.base_page)
        self.page_queue.put(self.base_page)
        self._crawl_web_page()
        yield self.page_queue.join()

    @coroutine
    def _crawl_web_page(self):
        while True:
            if len(self.intermediate_urls) < 5 and self.start_idle_counter:
                print("Unprocessed urls : ")
                for page in self.intermediate_urls:
                    print(u'>>>>>> %s ' % page.encoded_url)
            # print("Available Semaphore %s" % self.semaphore.counter)
            yield self.semaphore.acquire()
            # print("0.Issued Semaphore %s " % (self.semaphore.counter + 1))
            self._fetch_page(self.semaphore.counter + 1)
            if len(self.intermediate_urls) < 5 and self.start_idle_counter:
                # wind down once the crawl has started and is nearly idle
                print("Unprocessed urls : ")
                for page in self.intermediate_urls:
                    print(u'>> %s ' % page.encoded_url)
                self.wrap_up()

    @coroutine
    def _fetch_page(self, semaphore_count):
        try:
            page = yield self.page_queue.get()
            if page in self.visited_urls or page in self.intermediate_urls:
                return
            if page.skip_page():
                self.skip_count += 1
                logger.debug(u"Skipped {} ".format(page.url))
                return
            logger.debug(
                u"1.Semaphore in use > %s int.count %s for %s" % (
                    semaphore_count, len(self.intermediate_urls), page.encoded_url))
            self.intermediate_urls.add(page)
            page.process(self)
            response = yield page.make_head_request()
            get_response = yield page._process_head_response(response)
            if get_response:
                page.process_get_response(get_response)
            print(
                u"Total urls added : {} , Total urls visited : {} , Total urls in process : {} Skipped : {},"
                u" semaphore used : {} "
                .format(self.added_count, len(self.visited_urls), len(self.intermediate_urls),
                        self.skip_count, semaphore_count))
            logger.debug(
                u"Total urls added : {} , Total urls visited : {} , Total urls in process : {} Skipped : {}, "
                u"semaphore {}"
                .format(self.added_count, len(self.visited_urls), len(self.intermediate_urls),
                        self.skip_count, self.semaphore.counter))
        except Exception as ex:
            logger.debug(ex)
        finally:
            self.page_queue.task_done()
            self.semaphore.release()
            logger.debug(
                u"2.Semaphore returned >> %s available %s after %s" % (
                    semaphore_count, self.semaphore.counter, page.encoded_url))

    def _filter_visited_links(self, page):
        return (page not in self.visited_urls
                and page not in self.intermediate_urls
                and page not in self.non_visited_urls)

    def add_sitemap_urls(self, parent_page):
        logger.debug("Adding sitemap urls as well for processing")
        http_client = HTTPClient()
        try:
            response = http_client.fetch(self.sitemap_url)
            val = bytes(response.body)
            root = objectify.fromstring(val)
            for url_element in root.url:
                page = _get_client_page(decode_to_unicode(url_element.loc.text),
                                        parent_page, self.base_site,
                                        self.base_domain, DOMAINS_TO_BE_SKIPPED)
                if page not in self.visited_urls and page not in self.non_visited_urls \
                        and page not in self.intermediate_urls:
                    print(u"Added {}".format(url_element.loc))
                    self.non_visited_urls.add(page)
                    self.added_count += 1
                    self.page_queue.put(page)
        except Exception as e:
            logger.error(u"Error adding sitemap urls from %s " % self.sitemap_url)
        finally:
            http_client.close()

    def _get_unique_non_visited_links(self, page):
        lock = Lock()
        lock.acquire()
        filtered_links = set(filter(self._filter_visited_links, page.links))
        lock.release()
        return filtered_links

    def process_web_page(self, web_page):
        logger.debug(u"Called {} for {}".format('process_web_page',
                                                unicode(web_page.url).encode("utf-8")))
        logger.debug(u"Removing %s " % web_page.url)
        self.visited_urls.add(web_page)
        self.non_visited_urls.discard(web_page)
        self.intermediate_urls.discard(web_page)
        unique_pages = self._get_unique_non_visited_links(web_page)
        for page in unique_pages:
            if page not in self.non_visited_urls:
                self.non_visited_urls.add(page)
                self.page_queue.put(page)
                self.added_count += 1
                logger.debug("Added link-url %s " % page.encoded_url)
        self.start_idle_counter = True

    def wrap_up(self):
        self.print_stats()
        IOLoop.instance().stop()
        print('Done crawling in %d seconds, fetched %s URLs.' % (
            time.time() - self.start, len(self.visited_urls)))

    def print_stats(self):
        print_pages_with_errors(True, self.visited_urls, "broken_external_links.txt")
        print_pages_with_errors(False, self.visited_urls, "broken_internal_links.txt")
        print_pages_with_hardcoded_links(self.visited_urls, "hardcoded_url_links.txt")
        print("\nTotal pages visited : {}\n".format(len(self.visited_urls)))
        print_pages_to_file("all_internal_pages.txt", False, self.visited_urls)
        print_pages_to_file("all_external_pages.txt", True, self.visited_urls)
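A hypothetical driver for the spider above, assuming TornadoSpider and the helper constants it uses are importable from the surrounding module; wrap_up() stops the IOLoop once the crawl winds down, so start() returns when the crawl is finished.

# Hypothetical usage sketch; run_crawl is not part of the original module.
from tornado.ioloop import IOLoop


def run_crawl(start_url):
    spider = TornadoSpider(start_url)
    IOLoop.instance().add_callback(spider.initiate_crawl)
    IOLoop.instance().start()   # stopped by spider.wrap_up()
    return spider.visited_urls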