Code example #1
File: spider.py  Project: JFK/spider
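This excerpt is the crawler's main fetch-and-parse method. It checks the worker's run status, issues a requests.get with the worker's headers, cookies, and a selected proxy, parses the body with BeautifulSoup, hands the parse tree to the caller-supplied `response` callback, records the visit, and enqueues any follow-up URLs via `projects.enqueue`. It relies on names defined elsewhere in spider.py: `SpiderError` (a project exception carrying a dict payload), `select_proxy`, `projects`, and the module-level flags `SAVE_TEXT` and `SAVE_HEADERS`.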
    def main(self, url, response, referer=None, tag=0):
        try:
            if not self.status:
                logging.info('status stopped...')
                err = {
                    'code': 500,
                    'url': url,
                    'msg': 'status stopped...',
                    'at': datetime.utcnow()
                }
                raise SpiderError(err)
            self.referer = referer
            logging.info('referer... %s' % referer)
            proxies = select_proxy()
            logging.info('proxy...')
            logging.info(proxies)
            logging.info('headers...')
            h = self.headers
            logging.info(h)
            logging.info('cookies...')
            c = self.cookies
            logging.info(c)
            # Note: no timeout is passed to requests.get, so a stalled
            # server can hang the worker indefinitely.
            resp = requests.get(url, headers=h, cookies=c, proxies=proxies)
            resp.encoding = self.encoding
            # Persist this response's cookies and headers in the worker
            # state; the current URL becomes the Referer for follow-ups.
            self.worker_cookies(resp.cookies)
            resp.headers.update({'Referer': url})
            self.worker_headers(resp.headers)
            logging.info('resp.headers...')
            logging.info(resp.headers)
            if resp.status_code == 404:
                # 404: log and skip the URL; no SpiderError is raised
                logging.info('Not found... %d' % resp.status_code)

            elif resp.status_code != 200:
                logging.info('Not 200... %d' % resp.status_code)
                err = {
                    'code': resp.status_code,
                    'url': url,
                    'msg': 'not 200',
                    'at': datetime.utcnow()
                }
                raise SpiderError(err)
            else:
                parsed = urlparse(url)
                logging.info(parsed)
                kwargs = dict(
                    pname=self.pname,
                    url=url,
                    text=resp.text if SAVE_TEXT else '',
                    headers=resp.headers if SAVE_HEADERS else {},
                    encoding=self.encoding,
                    response=response,
                    scheme=parsed.scheme,
                    host=parsed.netloc,
                    query=parsed.query,
                    path=parsed.path
                )

                logging.info('response tag... %d' % tag)
                soup = BeautifulSoup(resp.text, "lxml")
                tag, urls, opt = response(self, soup, tag, **kwargs)
                kwargs.update({'tag': tag})
                kwargs.update({'option': opt})
                self.visited(url, kwargs)
                logging.info('urls... %d', len(urls))
                if not urls:
                    logging.info('No urls...')
                else:
                    # Back off (at most 10 sleeps) while the job queue is
                    # over its configured size limit.
                    i = 0
                    while len(self.jobs) > self.max_job_count:
                        if i == 10:
                            break
                        msg = 'busy... %d > %d' % \
                            (len(self.jobs), self.max_job_count)
                        i += 1
                        self.sleep(msg)
                    urls = self.clean_urls(urls)
                    logging.info('enqueue urls... %d', len(urls))
                    # Enqueue follow-ups only if the queue was never seen
                    # busy above (i stayed at 0); otherwise drop them.
                    if i == 0:
                        projects.enqueue(self.redis, self.pname, self.db,
                                         self.max_job_count, self.interval,
                                         self.wait, urls, response,
                                         referer=self.referer,
                                         qname=self.qname, tag=tag,
                                         debug=self.debug)
                    logging.info('done!')

        except Exception as e:
            logging.warning(str(e))
            importlib.import_module('mylib.logger').sentry()
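The `response` argument is a callback that `main` invokes as `response(self, soup, tag, **kwargs)` and that must return a `(tag, urls, opt)` tuple. Below is a minimal sketch of a compatible callback; the name `list_links` and the link-extraction body are illustrative, not from the project:

    def list_links(spider, soup, tag, **kwargs):
        """Collect every href on the page as follow-up URLs."""
        # `soup` arrives already parsed by main() with the lxml parser.
        urls = [a['href'] for a in soup.find_all('a', href=True)]
        # Return the (possibly updated) tag, the URLs to enqueue, and an
        # options dict that main() stores with the visit record.
        return tag, urls, {}

A crawl could then be seeded with something like `spider.main(BASE_URL, list_links)` (a hypothetical call; in the project, jobs created by `projects.enqueue` carry the callback along).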
Code example #2
File: spider.py  Project: JFK/spider
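This excerpt from the project's command-line entry point dispatches on the parsed arguments: `args.stop` and `args.start` toggle a worker's status flag, `args.max_job_count` and `args.wait` update those settings, and when no control flag is given it reads the crawl settings (`MAX_JOB_COUNT`, `WAIT`, `BASE_URL`, `INTERVAL`) from the project module `m` and seeds the crawl by enqueuing the base URL.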
        # Control commands form one dispatch chain: stop/start toggle the
        # worker status and the next two branches update its settings.
        if args.p and args.stop:
            update_worker(args.p, 'status', 0)

        elif args.p and args.start:
            update_worker(args.p, 'status', 1)

        elif args.p and args.max_job_count:
            update_worker(args.p, 'max_job_count', int(args.max_job_count))

        elif args.p and args.wait:
            update_worker(args.p, 'wait', int(args.wait))

        else:
            # No control command given: read the crawl settings from the
            # project module `m` and seed the crawl with its base URL.
            max_job_count = getattr(m, 'MAX_JOB_COUNT')
            wait = getattr(m, 'WAIT')
            url = getattr(m, 'BASE_URL')
            interval = getattr(m, 'INTERVAL')
            redis = {
                'HOST': rq.REDIS_HOST,
                'PORT': rq.REDIS_PORT,
                'DB': rq.REDIS_DB
            }
            debug = args.debug
            projects.enqueue(redis, args.p, DB, max_job_count, interval,
                             wait, [url], response, qname=args.q,
                             debug=args.debug)

    except Exception:
        # In this excerpt `debug` is only bound in the final else branch,
        # so this handler can raise NameError if an earlier branch fails.
        importlib.import_module('mylib.logger').sentry(debug=debug)
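For context, a minimal argparse setup that would produce the attributes used above (`args.p`, `args.q`, `args.start`, `args.stop`, `args.max_job_count`, `args.wait`, `args.debug`) could look like the following. This is a hypothetical reconstruction, not the project's actual parser:

    import argparse

    parser = argparse.ArgumentParser(description='spider worker control')
    parser.add_argument('-p', help='project name')
    parser.add_argument('-q', help='queue name')
    parser.add_argument('--start', action='store_true', help='start the worker')
    parser.add_argument('--stop', action='store_true', help='stop the worker')
    parser.add_argument('--max-job-count', dest='max_job_count',
                        help='update the maximum concurrent job count')
    parser.add_argument('--wait', help='update the seconds to wait between jobs')
    parser.add_argument('--debug', action='store_true', help='enable debug mode')
    args = parser.parse_args()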