Example #1
    def test_a110_one(self):
        pid, fd = os.forkpty()
        #cmd = [sys.executable]
        cmd = ['coverage', 'run']
        cmd += [
            inspect.getsourcefile(run),
            'one',
            '-i',
            inspect.getsourcefile(data_sample_handler)
        ]

        if pid == 0:
            # child
            os.execvp(cmd[0], cmd)
        else:
            # parent
            def wait_text(timeout=1):
                # drain pty output until it pauses for `timeout` seconds
                import select
                text = []
                while True:
                    rl, wl, xl = select.select([fd], [], [], timeout)
                    if not rl:
                        break
                    try:
                        t = os.read(fd, 1024)
                    except OSError:
                        break
                    if not t:
                        break
                    t = utils.text(t)
                    text.append(t)
                    print(t, end='')
                return ''.join(text)

            text = wait_text()
            self.assertIn('new task data_sample_handler:on_start', text)
            self.assertIn('pyspider shell', text)

            os.write(fd, utils.utf8('run()\n'))
            text = wait_text()
            self.assertIn('task done data_sample_handler:on_start', text)

            os.write(fd, utils.utf8('crawl("%s/pyspider/test.html")\n' % self.httpbin))
            text = wait_text()
            self.assertIn('/robots.txt', text)

            os.write(fd, utils.utf8('crawl("%s/links/10/0")\n' % self.httpbin))
            text = wait_text(2)
            self.assertIn('"title": "Links"', text)

            os.write(fd, utils.utf8('crawl("%s/404")\n' % self.httpbin))
            text = wait_text()
            self.assertIn('task retry', text)

            os.write(fd, b'quit_pyspider()\n')
            text = wait_text()
            self.assertIn('scheduler exiting...', text)
            os.close(fd)
            os.kill(pid, signal.SIGINT)
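The pattern above is a reusable way to integration-test any interactive REPL: fork a pseudo-terminal with os.forkpty(), exec the program under test in the child, then poll the master fd with select until output pauses. A minimal standalone sketch of just that plumbing, assuming a stock python3 interpreter as the target (everything here is illustrative, not part of the pyspider test suite):

import os
import select

def drive_repl(cmd, lines, timeout=1):
    """Run cmd in a pty, feed it input lines, return the captured output."""
    pid, fd = os.forkpty()
    if pid == 0:
        # child: replace ourselves with the REPL under test
        os.execvp(cmd[0], cmd)
    chunks = []
    for line in lines:
        os.write(fd, line.encode('utf8') + b'\n')
        while True:
            # wait until the child goes quiet for `timeout` seconds
            rl, _, _ = select.select([fd], [], [], timeout)
            if not rl:
                break
            try:
                data = os.read(fd, 1024)
            except OSError:  # pty closed on the child side
                break
            if not data:
                break
            chunks.append(data.decode('utf8', 'replace'))
    os.close(fd)
    os.waitpid(pid, 0)
    return ''.join(chunks)

# e.g. drive_repl(['python3', '-i', '-q'], ['1 + 1', 'quit()'])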
Example #2
        def generator():
            stringio = StringIO.StringIO()
            csv_writer = csv.writer(stringio)

            it = iter(resultdb.select(project))
            # buffer the first 30 rows so the common field set can be inferred
            first_30 = []
            for result in it:
                first_30.append(result)
                if len(first_30) >= 30:
                    break
            common_fields, _ = result_formater(first_30)
            common_fields_l = sorted(common_fields)

            csv_writer.writerow(["url"] + [utf8(x) for x in common_fields_l] + ["..."])
            for result in itertools.chain(first_30, it):
                other = {}
                for k, v in result["result"].iteritems():
                    if k not in common_fields:
                        other[k] = v
                csv_writer.writerow(
                    [toString(result["url"])]
                    + [toString(result["result"].get(k, "")) for k in common_fields_l]
                    + [toString(other)]
                )
                yield stringio.getvalue()
                stringio.truncate(0)
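This generator is Python 2 code (StringIO.StringIO, dict.iteritems), and it leans on a Python 2 quirk: StringIO.truncate(0) also rewinds the read/write position, so the buffer can be reused for the next row. On Python 3 the rewind has to be explicit. A minimal sketch of the same streaming-CSV idiom in Python 3 terms (the field names are illustrative):

import csv
import io

def stream_csv(rows, fields):
    # reuse one buffer, yielding one CSV-formatted chunk per row
    buf = io.StringIO()
    writer = csv.writer(buf)
    writer.writerow(fields)
    yield buf.getvalue()
    for row in rows:
        buf.seek(0)
        buf.truncate(0)  # Python 3: truncate() alone does not move the cursor
        writer.writerow([row.get(k, '') for k in fields])
        yield buf.getvalue()

# e.g. ''.join(stream_csv([{'url': 'http://example.com/'}], ['url']))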
Example #3
 def _stringify(data):
     if 'result' in data:
         data['result'] = json.dumps(data['result'])
     if six.PY3:
         for key, value in list(six.iteritems(data)):
             if isinstance(value, six.string_types):
                 data[key] = utils.utf8(value)
     return data
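_stringify here prepares a record for a bytes-oriented database driver: it JSON-encodes the structured result field and, on Python 3, converts every remaining text value to UTF-8 bytes. A self-contained sketch, under the assumption that pyspider's utils.utf8 encodes text and passes bytes through unchanged:

import json
import six

def utf8(value):
    # assumed behaviour of pyspider.libs.utils.utf8
    if isinstance(value, six.text_type):
        return value.encode('utf8')
    return value

def _stringify(data):
    if 'result' in data:
        data['result'] = json.dumps(data['result'])
    if six.PY3:
        for key, value in list(six.iteritems(data)):
            if isinstance(value, six.string_types):
                data[key] = utf8(value)
    return data

# e.g. _stringify({'taskid': 'abc', 'result': {'title': 'Links'}})
# -> {'taskid': b'abc', 'result': b'{"title": "Links"}'} on Python 3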
Example #4
 def getMemberNames(self):
     members = []
     for project in self.projectdb.get_all(fields=["name"]):
         project_name = utf8(project["name"])
         if not project_name.endswith(".py"):
             project_name += ".py"
         members.append(project_name)
     return members
Example #5
 def getMemberList(self):
     members = []
     for project in self.projectdb.get_all():
         project_name = utf8(project["name"])
         if not project_name.endswith(".py"):
             project_name += ".py"
         members.append(
             ScriptResource(os.path.join(self.path, project_name),
                            self.environ, self.app, project))
     return members
Example #6
 def getMemberNames(self):
     members = []
     for project in self.projectdb.get_all(fields=['name', ]):
         project_name = project['name']
         if not project_name.endswith('.py'):
             project_name += '.py'
         members.append(utf8(project_name))
     return members
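getMemberNames and getMemberList implement the collection interface of a WsgiDAV-style resource provider; pyspider exposes each stored project as a virtual <name>.py script file. The naming rule the variants share, isolated as a plain function (the input records are illustrative):

def member_names(projects):
    # map stored project records to WebDAV member file names
    names = []
    for project in projects:
        name = project['name']
        if not name.endswith('.py'):
            name += '.py'
        names.append(name)
    return names

# e.g. member_names([{'name': 'data_sample_handler'}, {'name': 'legacy.py'}])
# -> ['data_sample_handler.py', 'legacy.py']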
Example #7
 def _stringify(data):
     for each in ('schedule', 'fetch', 'process', 'track'):
         if each in data:
             data[each] = json.dumps(data[each])
     if six.PY3:
         for key, value in list(six.iteritems(data)):
             if isinstance(value, six.string_types):
                 data[key] = utils.utf8(value)
     return data
Example #8
 def getMemberList(self):
     members = []
     for project in self.projectdb.get_all():
         project_name = project['name']
         if not project_name.endswith('.py'):
             project_name += '.py'
         native_path = os.path.join(self.path, project_name)
         # WsgiDAV expects a native str path: text on Python 3, bytes on Python 2
         native_path = text(native_path) if six.PY3 else utf8(native_path)
         members.append(
             ScriptResource(native_path, self.environ, self.app, project))
     return members
Example #9
 def _stringify(data):
     if six.PY3:
         for key, value in list(six.iteritems(data)):
             if isinstance(value, six.string_types):
                 data[key] = utils.utf8(value)
     return data
Example #10
    def puppeteer_fetch(self, url, task):
        '''Fetch with puppeteer proxy'''
        start_time = time.time()
        self.on_fetch('puppeteer', task)
        handle_error = lambda x: self.handle_error('puppeteer', url, task, start_time, x)

        # check puppeteer proxy is enabled
        if not self.puppeteer_proxy:
            result = {
                "orig_url": url,
                "content": "puppeteer is not enabled.",
                "headers": {},
                "status_code": 501,
                "url": url,
                "time": time.time() - start_time,
                "cookies": {},
                "save": task.get('fetch', {}).get('save')
            }
            logger.warning("[501] %s:%s %s 0s", task.get('project'), task.get('taskid'), url)
            raise gen.Return(result)

        # setup request parameters
        fetch = self.pack_tornado_request_parameters(url, task)
        task_fetch = task.get('fetch', {})
        for each in task_fetch:
            if each not in fetch:
                fetch[each] = task_fetch[each]

        # robots.txt
        if task_fetch.get('robots_txt', False):
            user_agent = fetch['headers']['User-Agent']
            can_fetch = yield self.can_fetch(user_agent, url)
            if not can_fetch:
                error = tornado.httpclient.HTTPError(403, 'Disallowed by robots.txt')
                raise gen.Return(handle_error(error))

        request_conf = {
            'follow_redirects': False
        }
        request_conf['connect_timeout'] = fetch.get('connect_timeout', 20)
        request_conf['request_timeout'] = fetch.get('request_timeout', 120) + 1

        session = cookies.RequestsCookieJar()
        if 'Cookie' in fetch['headers']:
            c = http_cookies.SimpleCookie()
            try:
                c.load(fetch['headers']['Cookie'])
            except AttributeError:
                c.load(utils.utf8(fetch['headers']['Cookie']))
            for key in c:
                session.set(key, c[key])
            del fetch['headers']['Cookie']
        if 'cookies' in fetch:
            session.update(fetch['cookies'])
            del fetch['cookies']

        request = tornado.httpclient.HTTPRequest(url=fetch['url'])
        cookie_header = cookies.get_cookie_header(session, request)
        if cookie_header:
            fetch['headers']['Cookie'] = cookie_header

        logger.info("%s", self.puppeteer_proxy)
        # making requests
        fetch['headers'] = dict(fetch['headers'])
        headers = {}
        headers['Content-Type'] = 'application/json; charset=UTF-8'
        try:
            request = tornado.httpclient.HTTPRequest(
                url=self.puppeteer_proxy, method="POST", headers=headers,
                body=json.dumps(fetch), **request_conf)
        except Exception as e:
            raise gen.Return(handle_error(e))

        try:
            response = yield gen.maybe_future(self.http_client.fetch(request))
        except tornado.httpclient.HTTPError as e:
            if e.response:
                response = e.response
            else:
                raise gen.Return(handle_error(e))

        if not response.body:
            raise gen.Return(handle_error(Exception('no response from puppeteer: %r' % response)))

        result = {}
        try:
            result = json.loads(utils.text(response.body))
            assert 'status_code' in result, result
        except Exception as e:
            if response.error:
                result['error'] = utils.text(response.error)
            raise gen.Return(handle_error(e))

        if result.get('status_code', 200):
            logger.info("[%d] %s:%s %s %.2fs", result['status_code'],
                        task.get('project'), task.get('taskid'), url, result['time'])
        else:
            logger.error("[%d] %s:%s %s, %r %.2fs", result['status_code'],
                         task.get('project'), task.get('taskid'),
                         url, result['content'], result['time'])

        raise gen.Return(result)
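puppeteer_fetch above (and http_fetch later in this listing) are old-style Tornado generator coroutines: I/O is awaited with yield, and the return value is delivered by raising tornado.gen.Return, the Python 2 compatible substitute for returning from a generator (the @gen.coroutine decorator sits outside the excerpt). A minimal sketch of that calling convention:

from tornado import gen, ioloop

@gen.coroutine
def fetch_status(result):
    # old-style coroutine: 'return' a value by raising gen.Return
    yield gen.sleep(0)  # stand-in for the real network round trip
    raise gen.Return(result.get('status_code', 200))

# e.g. ioloop.IOLoop.current().run_sync(
#          lambda: fetch_status({'status_code': 404}))  # -> 404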
Example #11
 def _stringify(data):
     for each in ('schedule', 'fetch', 'process', 'track'):
         if each in data:
             data[each] = utils.utf8(json.dumps(data[each]))
     return data
Example #12
 def _stringify(data):
     if 'result' in data:
         data['result'] = utils.utf8(json.dumps(data['result']))
     return data
Example #13
    def http_fetch(self, url, task):
        '''HTTP fetcher'''
        start_time = time.time()
        self.on_fetch('http', task)
        handle_error = lambda x: self.handle_error('http', url, task,
                                                   start_time, x)

        # setup request parameters
        fetch = self.pack_tornado_request_parameters(url, task)
        task_fetch = task.get('fetch', {})

        session = cookies.RequestsCookieJar()
        # fix for tornado request obj
        if 'Cookie' in fetch['headers']:
            c = http_cookies.SimpleCookie()
            try:
                c.load(fetch['headers']['Cookie'])
            except AttributeError:
                c.load(utils.utf8(fetch['headers']['Cookie']))
            for key in c:
                session.set(key, c[key])
            del fetch['headers']['Cookie']
        if 'cookies' in fetch:
            session.update(fetch['cookies'])
            del fetch['cookies']

        max_redirects = task_fetch.get('max_redirects', 5)
        # we will handle redirects by hand to capture cookies
        fetch['follow_redirects'] = False

        # making requests
        while True:
            # robots.txt
            if task_fetch.get('robots_txt', False):
                can_fetch = yield self.can_fetch(
                    fetch['headers']['User-Agent'], fetch['url'])
                if not can_fetch:
                    error = tornado.httpclient.HTTPError(
                        403, 'Disallowed by robots.txt')
                    raise gen.Return(handle_error(error))

            try:
                request = tornado.httpclient.HTTPRequest(**fetch)
                # if cookie already in header, get_cookie_header wouldn't work
                old_cookie_header = request.headers.get('Cookie')
                if old_cookie_header:
                    del request.headers['Cookie']
                cookie_header = cookies.get_cookie_header(session, request)
                if cookie_header:
                    request.headers['Cookie'] = cookie_header
                elif old_cookie_header:
                    request.headers['Cookie'] = old_cookie_header
            except Exception as e:
                logger.exception(fetch)
                raise gen.Return(handle_error(e))

            try:
                response = yield gen.maybe_future(
                    self.http_client.fetch(request))
            except tornado.httpclient.HTTPError as e:
                if e.response:
                    response = e.response
                else:
                    raise gen.Return(handle_error(e))

            extract_cookies_to_jar(session, response.request, response.headers)
            if (response.code in (301, 302, 303, 307)
                    and response.headers.get('Location')
                    and task_fetch.get('allow_redirects', True)):
                if max_redirects <= 0:
                    error = tornado.httpclient.HTTPError(
                        599, 'Maximum (%d) redirects followed' %
                        task_fetch.get('max_redirects', 5), response)
                    raise gen.Return(handle_error(error))
                if response.code in (302, 303):
                    fetch['method'] = 'GET'
                    if 'body' in fetch:
                        del fetch['body']
                fetch['url'] = quote_chinese(
                    urljoin(fetch['url'], response.headers['Location']))
                fetch['request_timeout'] -= time.time() - start_time
                if fetch['request_timeout'] < 0:
                    fetch['request_timeout'] = 0.1
                max_redirects -= 1
                continue

            result = {}
            result['orig_url'] = url
            result['content'] = response.body or ''
            result['headers'] = dict(response.headers)
            result['status_code'] = response.code
            result['url'] = response.effective_url or url
            result['time'] = time.time() - start_time
            result['cookies'] = session.get_dict()
            result['save'] = task_fetch.get('save')
            if response.error:
                result['error'] = utils.text(response.error)
            if 200 <= response.code < 300:
                logger.info("[%d] %s:%s %s %.2fs", response.code,
                            task.get('project'), task.get('taskid'), url,
                            result['time'])
            else:
                logger.warning("[%d] %s:%s %s %.2fs", response.code,
                               task.get('project'), task.get('taskid'), url,
                               result['time'])

            raise gen.Return(result)
Example #14
 def getContentLength(self):
     return len(utf8(self.project["script"]))
Example #15
 def getContent(self):
     return BytesIO(utf8(self.project['script']))