Example #1
    def handle_response(domain, response):
        app_log_process('handle response %s' % domain)

        error = u''
        body = u''
        effective_url = response.effective_url

        if response.body:
            body = to_unicode(response.body)

        if response.error:
            error = to_unicode(str(response.error))

        app_log_process('handle response result %s %s' % (response.code, error))

        return error, effective_url, body
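The to_unicode helper used above is not shown in these snippets; it may be tornado.escape.to_unicode or a small local wrapper. A minimal stand-in, given as an assumption rather than the project's code:

    def to_unicode(value, encoding='utf-8'):
        # Decode byte strings to text; leave values that are already text untouched.
        # Stand-in only; the real helper may use a different encoding policy.
        if isinstance(value, bytes):
            return value.decode(encoding, errors='replace')
        return value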
Example #2
    def handle_response(domain, response):
        app_log_process('handle response %s' % domain)

        error = u''
        body = u''
        effective_url = response.effective_url

        if response.body:
            body = to_unicode(response.body)

        if response.error:
            error = to_unicode(str(response.error))

        app_log_process('handle response result %s %s' %
                        (response.code, error))

        return error, effective_url, body
Example #3
    def run(self, domain_id):
        app_log_process('start parsing %s domain_id' % domain_id)

        crawling_result = None
        try:
            crawling_result = self.storage.get_crawling_result(domain_id)
            app_log_process('found response %s %s' %
                            (crawling_result[0], crawling_result[1]))
        except IOError:
            app_log_process('skip by not found sources')
            return_by_raise()

        domain_name, error, effective_url, source = crawling_result

        parsing_result = self.parse_result(domain_name, error, effective_url,
                                           source)
        app_log_process('parsed result %s' % parsing_result[0])

        if parsing_result[0] == RESULT_ERROR:
            yield self.storage.update_by_parser(domain_id, False)
            yield self.storage.clear_relations_from(domain_id)

        elif parsing_result[0] == RESULT_FULL_REDIRECT:
            new_domain_id = yield self.storage.add_domain_custom(
                parsing_result[1])
            yield self.storage.update_by_parser(domain_id, True)
            yield self.storage.clear_relations_from(domain_id)
            yield self.storage.add_relations_from([(domain_id, new_domain_id)])

        elif parsing_result[0] == RESULT_LINKS:
            yield self.storage.update_by_parser(domain_id, True)
            relations = []
            for link in parsing_result[1]:
                new_domain_id = yield self.storage.add_domain_custom(link)
                relations.append((domain_id, new_domain_id))
            yield self.storage.clear_relations_from(domain_id)
            yield self.storage.add_relations_from(relations)

        else:
            raise RuntimeError('Unknown parsing result type %s' %
                               parsing_result[0])

        self.storage.clear_crawling_result(domain_id)
        app_log_process('end parsing process')
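The RESULT_ERROR, RESULT_FULL_REDIRECT and RESULT_LINKS constants compared against parsing_result[0] are defined elsewhere in the project; any three distinct values would work. A plausible stand-in, given as an assumption:

    # Parsing outcome markers shared by Parser.run and parse_result (stand-in values).
    RESULT_ERROR = 0
    RESULT_FULL_REDIRECT = 1
    RESULT_LINKS = 2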
Example #4
    def run(self):
        if pycurl.version.find('c-ares') < 0:
            app_log_process('c-ares not installed (%s)' % pycurl.version, logging.ERROR)

        num_conn = min(len(self.domains), options.crawler_curl_conn)
        app_log_process('start crawling process %d domains, %d conn, %d timeout' % (len(self.domains), num_conn,
                                                                                    options.crawler_curl_timeout))
        tornado.httpclient.AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient",
                                                     max_clients=num_conn)
        http_client = tornado.httpclient.AsyncHTTPClient()

        for i, domain in enumerate(self.domains):
            id, url = domain
            request = tornado.httpclient.HTTPRequest(url, connect_timeout=options.crawler_curl_timeout,
                                                     request_timeout=options.crawler_curl_timeout,
                                                     follow_redirects=True,
                                                     max_redirects=options.crawler_curl_max_redirects)
            http_client.fetch(request, callback=(yield tornado.gen.Callback(i)))

        keys = set(range(len(self.domains)))
        while keys:
            key, response = yield yieldpoints.WaitAny(keys)
            domain = self.domains[key]
            error, effective_url, body = self.handle_response(self.domains[key], response)
            keys.remove(key)
            self.storage.save_crawling_result(str(domain[0]), domain[1], error, effective_url, body)
            self.q.add_parser_task(domain[0])

        http_client.close()
        app_log_process('end crawling process')
Example #5
def parser_process():
    app_log_process('start parser process')
    log_fds('start')
    log_mem('start')
    q = Q()
    s = Storage()
    parser = Parser(s)

    while True:
        log_fds('start loop')
        log_mem('start loop')
        task = q.get_parser_task()
        if task:
            yield parser.run(task[2])
            q.complete_task(task[0])
        else:
            app_log_process("not found task")
            time.sleep(options.parser_sleep_period_sec)

    app_log_process('end parser process')
Example #6
def crawler_process():
    app_log_process('start crawler process')
    log_fds('start')
    log_mem('start')
    q = Q()
    s = Storage()

    while True:
        log_fds('start loop')
        log_mem('start loop')
        task = q.get_crawler_task()
        if task:
            crawler = Crawler(task[2], q, s)
            yield crawler.run()
            q.complete_task(task[0])
            del crawler
        else:
            app_log_process("not found task")
            time.sleep(options.crawler_sleep_period_sec)

    app_log_process('end crawler process')
    log_fds('end')
Example #7
    def run(self):
        if pycurl.version.find('c-ares') < 0:
            app_log_process('c-ares not installed (%s)' % pycurl.version,
                            logging.ERROR)

        num_conn = min(len(self.domains), options.crawler_curl_conn)
        app_log_process(
            'start crawling process %d domains, %d conn, %d timeout' %
            (len(self.domains), num_conn, options.crawler_curl_timeout))
        tornado.httpclient.AsyncHTTPClient.configure(
            "tornado.curl_httpclient.CurlAsyncHTTPClient",
            max_clients=num_conn)
        http_client = tornado.httpclient.AsyncHTTPClient()

        for i, domain in enumerate(self.domains):
            id, url = domain
            request = tornado.httpclient.HTTPRequest(
                url,
                connect_timeout=options.crawler_curl_timeout,
                request_timeout=options.crawler_curl_timeout,
                follow_redirects=True,
                max_redirects=options.crawler_curl_max_redirects)
            http_client.fetch(request,
                              callback=(yield tornado.gen.Callback(i)))

        keys = set(range(len(self.domains)))
        while keys:
            key, response = yield yieldpoints.WaitAny(keys)
            domain = self.domains[key]
            error, effective_url, body = self.handle_response(
                self.domains[key], response)
            keys.remove(key)
            self.storage.save_crawling_result(str(domain[0]), domain[1], error,
                                              effective_url, body)
            self.q.add_parser_task(domain[0])

        http_client.close()
        app_log_process('end crawling process')
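Crawler.run above fans the requests out with the pre-Tornado-4 gen.Callback / yieldpoints.WaitAny style and handles each response as it completes. For comparison, a rough sketch of the same fan-out with the coroutine API; fetch_all is a hypothetical helper, not project code, it assumes Tornado 4.x with pycurl available, and unlike WaitAny it waits for the whole batch before returning:

    import tornado.gen
    import tornado.httpclient

    @tornado.gen.coroutine
    def fetch_all(urls, max_clients=10, timeout=20):
        # Use the curl-based client, as Crawler.run does.
        tornado.httpclient.AsyncHTTPClient.configure(
            "tornado.curl_httpclient.CurlAsyncHTTPClient",
            max_clients=max_clients)
        client = tornado.httpclient.AsyncHTTPClient()
        # Start every fetch at once; raise_error=False keeps HTTP errors in the
        # response objects instead of raising, mirroring handle_response's checks.
        futures = [client.fetch(url,
                                connect_timeout=timeout,
                                request_timeout=timeout,
                                follow_redirects=True,
                                raise_error=False)
                   for url in urls]
        # Yielding a list of futures waits for all of them to complete.
        responses = yield futures
        raise tornado.gen.Return(list(zip(urls, responses)))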
Example #8
    def parse_result(self, domain_name, error, effective_url, source):
        if error:
            app_log_process('parse unknown error %s' % error, logging.DEBUG)
            return RESULT_ERROR, error

        final_domain = self.e.extract(effective_url)
        if not final_domain:
            app_log_process('parse redirect error %s' % effective_url,
                            logging.DEBUG)
            return RESULT_ERROR, 'invalid redirect to %s' % effective_url

        if final_domain != domain_name:
            app_log_process(
                'parse full redirect %s -> %s' % (domain_name, final_domain),
                logging.DEBUG)
            return RESULT_FULL_REDIRECT, final_domain

        s = source.lower().strip().encode('utf-8')
        if not s:
            app_log_process('parse empty source error %d' % len(s),
                            logging.DEBUG)
            return RESULT_ERROR, 'empty source %s' % len(s)

        if len(s) > options.parser_max_source_size_mb * 1024 * 1024:
            app_log_process('parse large source error %d' % len(s),
                            logging.DEBUG)
            return RESULT_ERROR, 'large source %s' % len(s)

        try:
            document = self.create_html_doc(s)
        except lxml.etree.ParserError as e:
            app_log_process('parser error %s' % e, logging.WARNING)
            return RESULT_ERROR, 'error parser %s' % e

        # a href
        href_links_source = document.xpath('//a/@href')
        app_log_process(
            'found a@href links %d (%s)' %
            (len(href_links_source), ','.join(href_links_source[:10])),
            logging.DEBUG)
        href_links = self._links_domain_filter(href_links_source, domain_name)
        app_log_process(
            'filtered a@href links %d (%s)' %
            (len(href_links), ','.join(href_links[:10])), logging.DEBUG)

        # # script src js
        # script_links = self._links_domain_filter(document.xpath('//script/@src'))
        # app_log_process('found script@src links %d (%s)' % (len(script_links), ','.join(script_links)), logging.DEBUG)
        #
        # # link href css
        # link_links = self._links_domain_filter(document.xpath('//link[contains(@rel, "stylesheet")]/@href'))
        # app_log_process('found link@href links %d (%s)' % (len(link_links), ','.join(link_links)), logging.DEBUG)
        #
        # # img src
        # img_links = self._links_domain_filter(document.xpath('//img/@src'))
        # app_log_process('found img@src links %d (%s)' % (len(img_links), ','.join(img_links)), logging.DEBUG)

        if len(href_links) > options.parser_max_link_count:
            app_log_process('parse too many links error %d' % len(href_links),
                            logging.DEBUG)
            return RESULT_ERROR, 'too many links %s' % len(href_links)

        return RESULT_LINKS, href_links
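create_html_doc is not shown in these examples; the except lxml.etree.ParserError clause above suggests it builds the document with lxml.html. A minimal stand-in, given as an assumption rather than the project's code:

    import lxml.html

    def create_html_doc(source_bytes):
        # document_fromstring raises lxml.etree.ParserError on empty or
        # unparseable input, which matches the except clause in parse_result.
        return lxml.html.document_fromstring(source_bytes)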