Example #1
0
    def get_data(cls, account, source_filter, limit=100, skip=0):
        """Concurrently fetch every page of a paginated OData feed.

        Worker coroutines pull URLs from a queue (throttled by a bounded
        semaphore), follow ``@odata.nextLink`` pagination, and accumulate
        each page's ``items`` payload.

        Parameters
        ----------
        account : object exposing ``get_request(url)`` returning an HTTP request
        source_filter : unused here; kept for interface compatibility
        limit, skip : unused here; kept for interface compatibility

        Returns
        -------
        str
            JSON string of the accumulated data, formatted by
            ``cls.format_data_to_schema``.
        """
        # set up our queue and semaphore
        queue = JoinableQueue()
        sem = BoundedSemaphore(FETCH_CONCURRENCY)
        done, working = set(), set()
        data = []

        # coroutine that fetches a single queued page
        @gen.coroutine
        def fetch_url():
            current_url = yield queue.get()
            try:
                if current_url in working:
                    return
                # mark the url as in-flight so no other worker re-fetches it
                working.add(current_url)
                req = account.get_request(current_url)
                client = AsyncHTTPClient()
                response = yield client.fetch(req)
                # record the url as completed
                done.add(current_url)
                response_data = json.loads(response.body.decode('utf-8'))
                # BUG FIX: ``data.__add__(...)`` built a new list and threw it
                # away, so no items were ever accumulated; extend() mutates
                # ``data`` in place.
                data.extend(response_data.get('items', []))
                # follow OData pagination when another page exists
                next_url = response_data.get('@odata.nextLink', None)
                if next_url is not None:
                    queue.put(next_url)
            finally:
                # ...and finally signal that we're done and release our semaphore
                queue.task_done()
                sem.release()

        # worker loop: one fetch per acquired semaphore slot
        @gen.coroutine
        def worker():
            while True:
                yield sem.acquire()
                fetch_url()

        # the urls we will be fetching data from
        uris = ['http://some_paginated_odata_api/example/', 'http://some_paginated_odata_api/example2/']
        # BUG FIX: the loop previously enqueued the undefined name ``url``;
        # each seed URI must be enqueued.
        for uri in uris:
            queue.put(uri)
        # start our queue worker
        worker()
        # wait until every queued page is processed (bounded in time)
        yield queue.join(deadline=timedelta(seconds=MAXIMUM_REQ_TIME))

        # "format" the data according to the schema specified on the class
        formatted = cls.format_data_to_schema(data)

        # and we're done
        return json.dumps(formatted)
Example #2
0
    def __init__(self, server, conn, stream, io_loop=None, *args, **kwargs):
        """Bind this session to its server, transport stream and connection.

        Parameters
        ----------
        server : owning server object; provides ``ipaddr`` and ``port``
        conn : connection factory, called with this session plus the main
            server address to build the protocol-level connection
        stream : stream transport for the device socket
        io_loop : event loop used to schedule the session workflow
            # NOTE(review): the default ``None`` would fail at ``add_future``
            # below — callers appear required to pass a real loop; confirm.
        """
        self.server = server
        self.stream = stream
        self.conn = conn(self,
                         main_server_ip=self.server.ipaddr,
                         main_server_port=self.server.port,)
        # IMEI of the device; filled in once the device identifies itself
        self.session_key = None

        self.state = CONNECTING
        self.io_loop = io_loop
        # get notified when the device socket drops
        self.stream.set_close_callback(self.socket_closed)
        # incoming messages are staged here before protocol processing
        self.job_queue = JoinableQueue()
        # serial_number -> Future, resolved when the matching ACK arrives
        self.registered_cmds = {}
        self.cnt_number = '0000'
        # kick off the read/process workflow on the loop; the lambda
        # re-raises any failure from the scheduled future
        self.io_loop.add_future(self.init_workflow(),
                                lambda future: future.result())
        super(TerminalSession, self).__init__(*args, **kwargs)
    def __init__(self, start_url, sitemap_url=None, max_concurrent_connections=MAX_CONCURRENT_REQUESTS_PER_SERVER):
        """Prepare crawl state, derived site URLs, and concurrency primitives."""
        # Derived identity of the site being crawled.
        self.base_domain = extract_domain(start_url)
        self.base_site = extract_base_site(start_url)
        self.base_page = _get_client_page(start_url, None, start_url, self.base_domain, DOMAINS_TO_BE_SKIPPED)
        self.sitemap_url = sitemap_url or u'{}/sitemap.xml'.format(self.base_site)

        # URL bookkeeping: finished, in-flight, and still-pending pages.
        self.visited_urls = set()
        self.intermediate_urls = set()
        self.non_visited_urls = {self.base_page}

        # Counters and flags driving progress reporting and shutdown.
        self.added_count = 1
        self.idle_ping = 0
        self.skip_count = 0
        self.start_idle_counter = False
        self.start = time.time()

        # Concurrency plumbing.
        self.max_concurrent_connections = max_concurrent_connections
        self.page_queue = JoinableQueue()
        self.semaphore = BoundedSemaphore(self.max_concurrent_connections)
Example #4
0
class TerminalSession(RTOManager, QueclinkProtocol):

    r"""Base session implementation class.
    Session is shared object and low-level code for connection.

        Parameters
        ----------
            session_key - IMEI of device
    """

    # Flipped to True by _handle_message_flow on failure so that both
    # tail coroutines stop their loops.
    STOP_FLAG = False

    def __init__(self, server, conn, stream, io_loop=None, *args, **kwargs):
        """Bind the session to its server, transport and protocol connection.

        # NOTE(review): io_loop=None would fail at ``add_future`` below —
        # callers appear required to pass a real loop; confirm.
        """
        self.server = server
        self.stream = stream
        self.conn = conn(self,
                         main_server_ip=self.server.ipaddr,
                         main_server_port=self.server.port,)
        # IMEI of the device, learned during open() / the message flow
        self.session_key = None

        self.state = CONNECTING
        self.io_loop = io_loop
        self.stream.set_close_callback(self.socket_closed)
        # incoming messages are staged here before protocol processing
        self.job_queue = JoinableQueue()
        # serial_number -> Future, resolved when the matching ACK arrives
        self.registered_cmds = {}
        self.cnt_number = '0000'
        self.io_loop.add_future(self.init_workflow(),
                                lambda future: future.result())
        super(TerminalSession, self).__init__(*args, **kwargs)

    def is_open(self):
        """True once the device has identified itself."""
        return self.state == OPEN

    def is_closed(self):
        """True after close() has completed."""
        return self.state == CLOSED

    @gen.coroutine
    def open(self):
        """Opens the connect. According to the protocol,
        we should first configure the device and after than
        connection should be flagged as opened."""
        log = yield self.conn.verify_conn()
        unique_id = log.unique_id
        gen_log.info('CONNECTION OPENED WITH: %s' % unique_id)
        if unique_id:
            self.session_key = unique_id
            self.state = OPEN
            self.conn.on_open(unique_id)
            raise gen.Return(unique_id)
        raise gen.Return(conf.DISCONN_RESULT)

    @gen.coroutine
    def read_message(self):
        r"""Read one END_SIGN-terminated message from the stream.

        Raises StreamClosedError if the stream has already closed.
        """
        if self.stream.closed():
            raise StreamClosedError("Stream is closed")
        message = yield gen.Task(self.stream.read_until, conf.END_SIGN)
        raise gen.Return(message)

    @gen.coroutine
    def terminal_message_flow(self, msg):
        r"""Sets message flow.

        Dispatches a parsed message either as an ACK (resolving the
        registered command future) or as a report; returns the message's
        count number via gen.Return.
        """
        try:
            log, sack, from_buffer = super(
                TerminalSession, self).terminal_message_flow(msg)
        except MessageNotImplemented as e:  # silence exc
            gen_log.exception(e)
            return
        count_num = log.log.count_number
        if log.type == conf.ACK:
            # heartbeat ACKs are not tied to a registered command
            if not log.header == conf.HEARTBEAT_ACK:
                self.unregister_command_on_ack(log.log)
            yield self.conn.on_ack(msg, log, sack)
        else:
            if self.is_pending_rto(log.header):
                self.make_rto_response(log)
            # bad accuracy of gps
            # may be warn by email our guys that gps accuracy is weak
            # skip message logic
            if log.header == conf.FIXED_REPORT:
                gps_accuracy = int(log.log.gps_accuracy)
                if not gps_accuracy:    # or (20 < gps_accuracy <= 50):
                    self.skip_message = True
                else:
                    self.skip_message = False
            if getattr(self, 'skip_message', False):
                gen_log.info("Hey, GPS ACCURACY IS BAD")
                return
            yield self.conn.on_report(msg, log, sack, from_buffer=from_buffer)
        # first identified message also opens the session
        if not self.session_key and hasattr(log.log, 'unique_id'):
            self.session_key = log.log.unique_id
            self.state = OPEN
        raise gen.Return(count_num)

    def exec_command(self, msg_tp, header, body):
        r"""Sends message to the end-point. Returns promiseable future."""
        if msg_tp != conf.COMMAND:
            # BUG FIX: a bare ``raise`` outside an except block raises
            # ``RuntimeError: No active exception to reraise``; raise a
            # meaningful error instead.
            raise ValueError("exec_command only accepts COMMAND messages")
        serial_number, cmd_future = self.register_command_for_ack()
        body.update({'serial_number': serial_number})
        self.send_message(self.build_cmd(header, **body))
        return cmd_future

    def send_message(self, msg):
        """Write a raw message to the device stream and log it."""
        self.stream.write(msg)
        gen_log.info("SACK: %s", msg)

    def should_stop(self):
        """Worker loops poll this between iterations."""
        return self.STOP_FLAG

    @gen.coroutine
    def init_workflow(self):
        """Schedule both tail coroutines; failures funnel into
        _handle_message_flow which shuts the session down."""
        schedule_at_loop(self.io_loop, self._tail_messagebus,
                         callback=self._handle_message_flow)
        schedule_at_loop(self.io_loop, self._tail_stream_buffer,
                         callback=self._handle_message_flow)

    @gen.coroutine
    def _tail_messagebus(self):
        """Drain the job queue, pushing each message through the flow."""

        def job_complete(f):
            # remember the count number of the last completed message
            self.cnt_number = f.result()

        while True:
            if self.should_stop():
                break
            message = yield self.job_queue.get()
            schedule_at_loop(self.io_loop, self.terminal_message_flow(message),
                             job_complete)
            self.job_queue.task_done()
            gen_log.info("INCOMING MSG: %s", message)

    @gen.coroutine
    def _tail_stream_buffer(self):
        """Read messages off the socket and enqueue them for processing."""
        while True:
            if self.should_stop():
                break
            message = yield self.read_message()
            yield self.job_queue.put(message)

    def _handle_message_flow(self, future):
        """Terminal callback for either tail coroutine.

        Any failure stops both loops and closes the session so the file
        descriptor cannot leak.
        """
        try:
            future.result()
        except Exception as e:
            app_log.exception(e)
        finally:
            self.STOP_FLAG = True
            self.close()

    def register_command_for_ack(self):
        r"""Returns serial number and future promise"""
        serial_number = ''
        # regenerate until we get a serial number not already in flight
        while not serial_number or serial_number in self.registered_cmds:
            serial_number = generate_random_hex()
        f = self.registered_cmds[serial_number] = Future()
        return serial_number, f

    def unregister_command_on_ack(self, log):
        """Resolve the Future registered for this ACK's serial number."""
        serial_number = log.serial_number
        future = self.registered_cmds.pop(serial_number, None)
        if not future:
            return
        # BUG FIX: use the public done() API instead of the private _done
        # attribute.
        if future.done():
            return future
        if not isinstance(log, Exception):
            future.set_result(log)
        else:
            future.set_exception(log)

    def unregister_commands(self):
        """Resolve every pending command future with the disconnect result."""
        if not self.is_open():
            # BUG FIX: iterate over a snapshot — popping while iterating the
            # live keys view raises RuntimeError on Python 3.
            for evt in list(self.registered_cmds.keys()):
                f = self.registered_cmds.pop(evt)
                f.set_result(conf.DISCONN_RESULT)
            # (removed a dead ``while self.registered_cmds`` loop whose body
            # was a bare expression — it made no progress and could spin
            # forever if any entry remained)

    def socket_closed(self):
        """Close-callback installed on the stream."""
        self.state = CLOSING
        self.close()

    def close(self):
        """Tear the session down: resolve futures, detach from the server,
        notify the connection and close the stream."""
        self.unregister_commands()
        self.server.close_session(self.session_key)
        self.conn.on_close()
        self.stream.close()
        self.state = CLOSED
        gen_log.info("CONNECTION CLOSED: %s", self.session_key)
class TornadoSpider:
    """Asynchronous site crawler built on Tornado queues and semaphores.

    Pages are pulled from ``page_queue`` by the ``_crawl_web_page`` loop,
    throttled by ``semaphore``; processed pages feed newly discovered
    links back into the queue until the crawl winds down.
    """

    def __init__(self, start_url, sitemap_url=None, max_concurrent_connections=MAX_CONCURRENT_REQUESTS_PER_SERVER):
        """Prepare crawl state, derived site URLs and concurrency primitives."""
        self.visited_urls = set()
        self.intermediate_urls = set()
        self.base_domain = extract_domain(start_url)
        self.base_site = extract_base_site(start_url)
        self.base_page = _get_client_page(start_url, None, start_url, self.base_domain, DOMAINS_TO_BE_SKIPPED)
        self.non_visited_urls = {self.base_page}
        self.added_count = 1
        self.idle_ping = 0
        self.start_idle_counter = False
        # default to <site>/sitemap.xml unless an explicit sitemap was given
        self.sitemap_url = u'{}/sitemap.xml'.format(self.base_site) if not sitemap_url else sitemap_url
        self.max_concurrent_connections = max_concurrent_connections

        self.page_queue = JoinableQueue()
        self.semaphore = BoundedSemaphore(self.max_concurrent_connections)
        self.start = time.time()
        self.skip_count = 0

    @coroutine
    def initiate_crawl(self):
        """Seed the queue with the start page (+ sitemap) and crawl until done."""
        self.non_visited_urls.add(self.base_page)
        self.add_sitemap_urls(self.base_page)
        self.page_queue.put(self.base_page)
        # fire-and-forget worker loop; join() below waits for completion
        self._crawl_web_page()
        yield self.page_queue.join()

    @coroutine
    def _crawl_web_page(self):
        """Worker loop: acquire a slot, fetch one queued page, repeat.

        Wraps up the crawl once few pages remain in flight after the idle
        counter has started.
        """
        while True:
            if len(self.intermediate_urls) < 5 and self.start_idle_counter:
                print("Unprocessed urls : ")
                for page in self.intermediate_urls:
                    print(u'>>>>>> %s ' % page.encoded_url)
            yield self.semaphore.acquire()
            # fire-and-forget; the semaphore is released in _fetch_page
            self._fetch_page(self.semaphore.counter + 1)
            if len(self.intermediate_urls) < 5 and self.start_idle_counter:
                print("Unprocessed urls : ")
                for page in self.intermediate_urls:
                    print(u'>> %s ' % page.encoded_url)
                self.wrap_up()

    @coroutine
    def _fetch_page(self, semaphore_count):
        """Fetch and process a single page pulled from the queue.

        Always marks the queue task done and releases the semaphore,
        even when processing fails.
        """
        # BUG FIX: keep ``page`` bound for the finally-block log even when
        # the queue get itself fails.
        page = None
        try:
            page = yield self.page_queue.get()
            if page in self.visited_urls or page in self.intermediate_urls:
                return
            if page.skip_page():
                self.skip_count += 1
                # BUG FIX: was ``"Skipped {} " % page.url`` — applying ``%``
                # to a brace-style template raises TypeError.
                logger.debug(u"Skipped {} ".format(page.url))
                return

            logger.debug(
                u"1.Sempahore in use> %s int.count %s for %s" % (semaphore_count, len(self.intermediate_urls),
                                                                 page.encoded_url))
            self.intermediate_urls.add(page)
            page.process(self)
            # HEAD first; only issue a GET when the HEAD response warrants it
            response = yield page.make_head_request()
            get_response = yield page._process_head_response(response)
            if get_response:
                page.process_get_response(get_response)
            print(
                u"Total urls added :  {} , Total urls visited : {} , Total urls in process : {} Skipped : {},"
                u" semaphore used : {} " \
                .format(self.added_count, len(self.visited_urls), len(self.intermediate_urls), self.skip_count,
                        semaphore_count))

            logger.debug(
                u"Total urls added :  {} , Total urls visited : {} , Total urls in process : {} Skipped : {}, "
                u"semaphore {}"
                .format(self.added_count, len(self.visited_urls), len(self.intermediate_urls), self.skip_count,
                        self.semaphore.counter))
        except Exception as ex:
            logger.debug(ex)
        finally:
            self.page_queue.task_done()
            self.semaphore.release()
            logger.debug(
                u"2.Sempahore returned>> %s  available %s after %s" % (
                    semaphore_count, self.semaphore.counter,
                    page.encoded_url if page is not None else u'<no page>'))

    def _filter_visited_links(self, page):
        """True when the page is not yet known in any bookkeeping set."""
        return page not in self.visited_urls and page not in self.intermediate_urls and page not in self.non_visited_urls

    def add_sitemap_urls(self, parent_page):
        """Synchronously fetch the sitemap and enqueue any unseen URLs."""
        logger.debug("Adding sitemap urls as well for processing")
        http_client = HTTPClient()
        try:
            response = http_client.fetch(self.sitemap_url)
            val = bytes(response.body)
            root = objectify.fromstring(val)

            for url_element in root.url:
                page = _get_client_page(decode_to_unicode(url_element.loc.text), parent_page, self.base_site,
                                        self.base_domain, DOMAINS_TO_BE_SKIPPED)
                if page not in self.visited_urls and page not in self.non_visited_urls \
                        and page not in self.intermediate_urls:
                    print(u"Added {}".format(url_element.loc))
                    self.non_visited_urls.add(page)
                    self.added_count += 1
                    self.page_queue.put(page)

        except Exception as e:
            # best-effort: a missing/broken sitemap must not abort the crawl
            logger.error(u"Error adding sitemap urls from %s " % self.sitemap_url)
        finally:
            http_client.close()

    def _get_unique_non_visited_links(self, page):
        """Filter a page's links down to never-before-seen pages.

        NOTE(review): the Lock created here is a fresh object per call, so
        it guards nothing; kept as-is to preserve behavior — confirm intent
        before removing.
        """
        l = Lock()
        l.acquire()
        filtered_links = set(filter(self._filter_visited_links, page.links))
        l.release()
        return filtered_links

    def process_web_page(self, web_page):
        """Record a finished page and enqueue its newly discovered links."""
        logger.debug(u"Called {} for {}".format('process_web_page', unicode(web_page.url).encode("utf-8")))
        logger.debug(u"Removing %s " % web_page.url)
        self.visited_urls.add(web_page)
        self.non_visited_urls.discard(web_page)
        self.intermediate_urls.discard(web_page)
        unique_pages = self._get_unique_non_visited_links(web_page)

        for page in unique_pages:
            if page not in self.non_visited_urls:
                self.non_visited_urls.add(page)
                self.page_queue.put(page)
                self.added_count += 1
                logger.debug("Added link-url %s " % page.encoded_url)

        # from here on the crawl may wind down once few pages are in flight
        self.start_idle_counter = True

    def wrap_up(self):
        """Dump statistics, stop the IOLoop and print the crawl summary."""
        self.print_stats()
        IOLoop.instance().stop()
        print('Done crawling in %d seconds, fetched %s URLs.' % (time.time() - self.start, len(self.visited_urls)))

    def print_stats(self):
        """Write the broken/hardcoded/visited link reports to disk."""
        print_pages_with_errors(True, self.visited_urls, "broken_external_links.txt")
        print_pages_with_errors(False, self.visited_urls, "broken_internal_links.txt")
        print_pages_with_hardcoded_links(self.visited_urls, "hardcoded_url_links.txt")

        print("\nTotal pages visited : {}\n".format(len(self.visited_urls)))

        print_pages_to_file("all_internal_pages.txt", False, self.visited_urls)
        print_pages_to_file("all_external_pages.txt", True, self.visited_urls)