def make_request(self, reqtype='regular', **kwargs):
    """Build a scrapy Request of the given type.

    reqtype is one of 'dologin', 'loginpage' or 'regular'.  Optional
    kwargs: 'url', 'response', 'relativeurl', 'dont_filter',
    'req_once_logged', 'shared'.  The proxy/slot/reqtype meta keys are
    attached to every request.
    """
    if 'url' in kwargs:
        kwargs['url'] = self.make_url(kwargs['url'])
    # Handle the requests.
    # If you need to bypass DDoS protection, put it in here.
    # BUG FIX: the original compared strings with `is`, which tests object
    # identity and only worked by accident through CPython string
    # interning; `==` is the correct comparison.
    if reqtype == 'dologin':
        req = self.craft_login_request_from_form(kwargs['response'])
        req.meta['shared'] = False
        req.priority = 10
    elif reqtype == 'loginpage':
        req = Request(self.make_url('loginpage'), dont_filter=True)
        req.meta['shared'] = False
        req.priority = 15
        req.dont_filter = True
    elif reqtype == 'regular':
        req = Request(kwargs['url'], headers=self.user_agent)
        req.meta['shared'] = True
    # Some meta-keys that are shipped with the request.
    if 'relativeurl' in kwargs:
        req.meta['relativeurl'] = kwargs['relativeurl']
    if 'dont_filter' in kwargs:
        req.dont_filter = kwargs['dont_filter']
    if 'req_once_logged' in kwargs:
        req.meta['req_once_logged'] = kwargs['req_once_logged']
    if 'shared' in kwargs:
        req.meta['shared'] = kwargs['shared']
    req.meta['proxy'] = self.proxy
    req.meta['slot'] = self.proxy
    # We tell the type so that we can redo it if login is required.
    req.meta['reqtype'] = reqtype
    return req
def make_request(self, reqtype='regular', **kwargs):
    """Build a scrapy Request of the given type.

    The 'shared' meta flag is decided once, at the end: an explicit
    kwargs['shared'] wins, otherwise only 'regular' requests are shared.
    Returns the request after priority assignment via ``self.set_priority``.
    """
    if 'url' in kwargs:
        kwargs['url'] = self.make_url(kwargs['url'])
    if reqtype == 'index':
        req = Request(self.make_url('index'), headers=self.tor_browser)
        req.dont_filter = True
    elif reqtype == 'loginpage':
        req = Request(self.make_url('login'), headers=self.tor_browser)
        req.dont_filter = True
    elif reqtype == 'dologin':
        req = self.request_from_login_page(kwargs['response'])
        req.dont_filter = True
    elif reqtype == 'captcha_img':
        req = Request(kwargs['url'], headers=self.tor_browser)
        req.dont_filter = True
    elif reqtype == 'image':
        req = Request(self.make_url(kwargs['url']), headers=self.tor_browser)
    elif reqtype == 'regular':
        req = Request(self.make_url(kwargs['url']), headers=self.tor_browser)
    # Set sharing.
    # FIX: the per-branch meta['shared'] assignments were dead code — this
    # block always overwrote them (e.g. the 'image' branch set shared=True
    # only for it to be clobbered to False here).  The effective behaviour
    # is preserved: kwargs wins, otherwise only 'regular' is shared.
    if 'shared' in kwargs:
        # Using kwargs you can set a regular request to not being shared.
        req.meta['shared'] = kwargs['shared']
    else:
        req.meta['shared'] = (reqtype == 'regular')
    if 'dont_filter' in kwargs:
        req.dont_filter = kwargs['dont_filter']
    if 'priority' in kwargs:
        req.priority = kwargs['priority']
    if 'req_once_logged' in kwargs:
        req.meta['req_once_logged'] = kwargs['req_once_logged']
    # Some default'ish options.
    req.meta['reqtype'] = reqtype
    req.meta['proxy'] = self.proxy
    req.meta['slot'] = self.proxy
    return self.set_priority(req)
def make_request(self, reqtype, **kwargs):
    """Create a scrapy Request for this forum.

    Supported reqtypes: 'index', 'dologin' (XenForo login form POST),
    'threadlisting', 'userprofile' and 'threadpage'; anything else raises.
    """
    if 'url' in kwargs:
        kwargs['url'] = self.make_url(kwargs['url'])
    if reqtype == 'index':
        req = Request(self.make_url('index'))
        req.dont_filter = True
    elif reqtype == 'dologin':
        formdata = {
            'login': self.login['username'],
            'register': '0',
            'password': self.login['password'],
            'cookie_check': '1',
            '_xfToken': "",
            'redirect': self.resource('index'),
        }
        # Forward the captcha fields only when the caller supplied them.
        for field in ('captcha_question_hash', 'captcha_question_answer'):
            if field in kwargs:
                formdata[field] = kwargs[field]
        req = FormRequest(self.make_url('login-postform'),
                          formdata=formdata,
                          callback=self.handle_login_response,
                          dont_filter=True)
        req.meta['req_once_logged'] = kwargs['req_once_logged']
        req.dont_filter = True
    elif reqtype in ('threadlisting', 'userprofile'):
        req = Request(kwargs['url'])
        req.meta['shared'] = True
    elif reqtype == 'threadpage':
        req = Request(kwargs['url'])
        req.meta['threadid'] = kwargs['threadid']
        req.meta['shared'] = True
    else:
        raise Exception('Unsuported request type ' + reqtype)
    # We tell the type so that we can redo it if login is required.
    req.meta['reqtype'] = reqtype
    req.meta['proxy'] = self.proxy  # meta[proxy] is handled by scrapy.
    return req
def make_request(self, reqtype, **kwargs):
    """Build a scrapy Request for *reqtype*.

    Known types: 'index', 'captcha_img', 'dologin', and the content types
    'ads_list', 'ads', 'ads_ratings', 'user', 'image', 'user_ratings',
    'ads_images'.  Any other value raises Exception.
    """
    if 'url' in kwargs:
        kwargs['url'] = self.make_url(kwargs['url'])
    if reqtype == 'index':
        req = Request(self.make_url('index'))
        if 'donotparse' in kwargs:
            req.meta['donotparse'] = True
        req.dont_filter = True
    elif reqtype == 'captcha_img':
        req = Request(kwargs['url'])
        req.dont_filter = True
    elif reqtype == 'dologin':
        req = self.create_request_from_login_page(kwargs['response'])
        req.meta['req_once_logged'] = kwargs['req_once_logged']
        req.dont_filter = True
    elif reqtype in ['ads_list', 'ads', 'ads_ratings', 'user', 'image',
                     'user_ratings', 'ads_images']:
        req = Request(self.make_url(kwargs['url']))
        req.meta['shared'] = True
        if reqtype == 'ads':
            req.meta['product_rating_for'] = kwargs['ads_id']
        if reqtype == 'user_ratings':
            req.meta['user_rating_for'] = kwargs['username']
            req.meta['username'] = kwargs['username']
        if reqtype == 'ads_ratings':
            req.meta['ads_rating_for'] = kwargs['ads_id']
            req.meta['ads_id'] = kwargs['ads_id']
    else:
        # FIX: an unknown reqtype previously fell through and crashed with
        # UnboundLocalError on `req`; fail loudly like the sibling makers.
        raise Exception('Unsupported request type %s' % reqtype)
    req.meta['reqtype'] = reqtype  # We tell the type so that we can redo it if login is required
    req.meta['proxy'] = self.proxy  # meta[proxy] is handled by scrapy.
    req.meta['slot'] = self.proxy
    if 'priority' in kwargs:
        req.priority = kwargs['priority']
    if 'accepted_currencies' in kwargs:
        req.meta['accepted_currencies'] = kwargs['accepted_currencies']
    if 'sublisting_quantity' in kwargs:
        req.meta['sublisting_quantity'] = kwargs['sublisting_quantity']
    return req
def make_request(self, reqtype, **kwargs):
    """Build a scrapy Request for *reqtype*; unknown types raise.

    Selected kwargs ('category', 'escrow', 'username') are forwarded into
    request meta when present.
    """
    forwarded_keys = ('category', 'escrow', 'username')
    if 'url' in kwargs:
        kwargs['url'] = self.make_url(kwargs['url'])
    if reqtype == 'index':
        req = Request(self.make_url('index'))
        req.dont_filter = True
    elif reqtype == 'dologin':
        req = self.craft_login_request_from_form(kwargs['response'])
        req.dont_filter = True
    elif reqtype == 'captcha':
        req = Request(self.make_url(kwargs['url']))
        req.dont_filter = True
    elif reqtype == 'image':
        req = Request(self.make_url(kwargs['url']))
    elif reqtype in ('category', 'product', 'userprofile', 'userproduct',
                     'userpgp', 'userfeedback'):
        req = Request(self.make_url(kwargs['url']))
        req.meta['shared'] = True
    else:
        raise Exception('Unsuported request type %s ' % reqtype)
    # Tag the request so it can be redone if login turns out to be required.
    req.meta['reqtype'] = reqtype
    req.meta['proxy'] = self.proxy  # meta[proxy] is handled by scrapy.
    if 'priority' in kwargs:
        req.priority = kwargs['priority']
    if 'dont_filter' in kwargs:
        req.dont_filter = kwargs['dont_filter']
    if reqtype == 'userfeedback':
        req.meta['user_rating_for'] = kwargs['username']
    for key in forwarded_keys:
        if key in kwargs:
            req.meta[key] = kwargs[key]
    if 'req_once_logged' in kwargs:
        req.meta['req_once_logged'] = kwargs['req_once_logged']
    return req
def make_request(self, reqtype='regular', **kwargs):
    """Build a scrapy Request of the given type.

    reqtype is one of 'dologin', 'loginpage' or 'regular'.  Returns the
    request after priority assignment via ``self.set_priority``.
    """
    if 'url' in kwargs:
        kwargs['url'] = self.make_url(kwargs['url'])
    # Handle the requests.
    # If you need to bypass DDoS protection, put it in here.
    # BUG FIX: strings were compared with `is` (object identity), which
    # only worked by accident through CPython string interning; use `==`.
    if reqtype == 'dologin':
        req = self.craft_login_request_from_form(kwargs['response'])
        req.dont_filter = True
    elif reqtype == 'loginpage':
        req = Request(self.make_url('loginpage'), dont_filter=True,
                      headers=self.tor_browser)
    elif reqtype == 'regular':
        req = Request(kwargs['url'], headers=self.tor_browser)
        # Ensures that requests are shared among spiders.
        req.meta['shared'] = True
    # Some meta-keys that are shipped with the request.
    if 'relativeurl' in kwargs:
        req.meta['relativeurl'] = kwargs['relativeurl']
    if 'dont_filter' in kwargs:
        req.dont_filter = kwargs['dont_filter']
    if 'req_once_logged' in kwargs:
        req.meta['req_once_logged'] = kwargs['req_once_logged']
    req.meta['proxy'] = self.proxy
    req.meta['slot'] = self.proxy
    return self.set_priority(req)
def make_request(self, reqtype='regular', **kwargs):
    """Build a scrapy Request of the given type ('dologin', 'loginpage'
    or 'regular'), attaching the proxy/slot/reqtype meta keys.

    Returns the request after priority assignment via ``self.set_priority``.
    """
    if 'url' in kwargs:
        kwargs['url'] = self.make_url(kwargs['url'])
    # Handle the requests.
    # BUG FIX: strings were compared with `is` (object identity), which
    # only worked by accident through CPython string interning; use `==`.
    if reqtype == 'dologin':
        req = self.craft_login_request_from_form(kwargs['response'])
        req.dont_filter = True
    elif reqtype == 'loginpage':
        req = Request(self.make_url('loginpage'), dont_filter=True,
                      headers=self.tor_browser)
    elif reqtype == 'regular':
        req = Request(kwargs['url'], headers=self.tor_browser)
        req.meta['shared'] = True
    if 'relativeurl' in kwargs:
        req.meta['relativeurl'] = kwargs['relativeurl']
    if 'dont_filter' in kwargs:
        req.dont_filter = kwargs['dont_filter']
    if 'req_once_logged' in kwargs:
        req.meta['req_once_logged'] = kwargs['req_once_logged']
    req.meta['proxy'] = self.proxy
    req.meta['slot'] = self.proxy
    req.meta['reqtype'] = reqtype
    return self.set_priority(req)
def _fetch_in_reactor(url, spider_cls=DefaultSpider, **kwargs):
    """Fetch *url* inside the reactor and return the eventual response.

    Parameters
    ----------
    url : str or Request
        The URL (or a prebuilt request) to fetch.
    spider_cls : scrapy.Spider (default: DefaultSpider)
        Spider class used by the crawler.
    kwargs : dict, optional
        Forwarded to ``_run_spider_in_reactor``.

    Returns
    -------
    crochet.EventualResult
    """
    def parse(self, response):
        self.response = response
    if isinstance(url, six.string_types):
        req = Request(url)
    else:
        req = url
    req.dont_filter = True
    req.meta['handle_httpstatus_all'] = True
    patched_cls = override_start_requests(spider_cls, [req], parse=parse)
    return _run_spider_in_reactor(patched_cls, **kwargs)
def make_request(self, reqtype='regular', **kwargs):
    """Build a scrapy Request of the given type.

    reqtype is one of 'dologin', 'regular', 'captcha', 'loginpage' or
    'forum_home'.  Captcha URLs are used verbatim (not passed through
    ``make_url``) and prefixed with the endpoint1 setting.
    """
    if 'url' in kwargs and reqtype != 'captcha':
        kwargs['url'] = self.make_url(kwargs['url'])
    # BUG FIX: strings were compared with `is` (object identity), which
    # only worked by accident through CPython string interning; use `==`.
    if reqtype == 'dologin':
        req = self.do_login(kwargs['response'])
    elif reqtype == 'regular':
        req = Request(kwargs['url'])
        req.meta["shared"] = True
    elif reqtype == 'captcha':
        captcha_full_url = self.spider_settings["endpoint1"] + kwargs['url']
        req = Request(captcha_full_url)
    elif reqtype == 'loginpage':
        login_url = self.spider_settings["endpoint1"] + "login"
        req = Request(login_url, dont_filter=True)
    elif reqtype == 'forum_home':
        req = Request(self.spider_settings["endpoint"])
    # Some meta-keys that are shipped with the request.
    if 'dont_filter' in kwargs:
        req.dont_filter = kwargs['dont_filter']
    if 'shared' in kwargs:
        req.meta['shared'] = kwargs['shared']
    if 'req_once_logged' in kwargs:
        req.meta['req_once_logged'] = kwargs['req_once_logged']
    req.meta['proxy'] = self.proxy
    req.meta['slot'] = self.proxy
    # We tell the type so that we can redo it if login is required.
    req.meta['reqtype'] = reqtype
    return req
def make_request(self, reqtype, **kwargs):
    """Create a scrapy Request for *reqtype*; unknown types raise."""
    if 'url' in kwargs:
        kwargs['url'] = self.make_url(kwargs['url'])
    if reqtype == 'index':
        req = Request(self.make_url('index'), dont_filter=True)
    elif reqtype == 'loginpage':
        req = Request(self.make_url('loginpage'), dont_filter=True)
    elif reqtype == 'dologin':
        req = self.craft_login_request_from_form(kwargs['response'])
        req.dont_filter = True
    elif reqtype == 'captcha_img':
        req = Request(self.make_url(kwargs['url']), dont_filter=True)
    elif reqtype in ('threadlisting', 'thread', 'userprofile'):
        req = Request(self.make_url(kwargs['url']))
        req.meta['shared'] = True
        if 'relativeurl' in kwargs:
            req.meta['relativeurl'] = kwargs['relativeurl']
    else:
        raise Exception('Unsuported request type ' + reqtype)
    # Tag the request so it can be redone if login turns out to be required.
    req.meta['reqtype'] = reqtype
    req.meta['proxy'] = self.proxy  # meta[proxy] is handled by scrapy.
    if 'req_once_logged' in kwargs:
        req.meta['req_once_logged'] = kwargs['req_once_logged']
    return req
def make_request(self, reqtype, **kwargs):
    """Build a scrapy Request for *reqtype*.

    Supports a two-step login ('dologin_username' then 'dologin_password'),
    plus 'index', 'loginpage', 'image' and the shared content types
    'category', 'listing', 'userprofile'.  Unknown types raise.
    """
    passthru_kwargs = ['category', 'relativeurl']
    if 'url' in kwargs:
        kwargs['url'] = self.make_url(kwargs['url'])
    if reqtype == 'index':
        req = Request(self.make_url('index'))
        req.dont_filter = True
    elif reqtype == 'loginpage':
        # BUG FIX: was `self.make_ur('login')` — a typo that raised
        # AttributeError whenever the login page was requested.
        req = Request(self.make_url('login'))
        req.dont_filter = True
    elif reqtype == 'dologin_username':
        # (also dropped the duplicated `req = req =` assignments)
        req = self.craft_login_username_request_from_form(kwargs['response'])
        req.dont_filter = True
    elif reqtype == 'dologin_password':
        req = self.craft_login_password_request_from_form(kwargs['response'])
        req.dont_filter = True
    elif reqtype == 'image':
        req = Request(url=kwargs['url'])
        if 'referer' in kwargs:
            req.headers['Referer'] = kwargs['referer']
    elif reqtype in ['category', 'listing', 'userprofile']:
        req = Request(url=kwargs['url'])
        req.meta['shared'] = True
    else:
        raise Exception('Unsuported request type %s ' % reqtype)
    for arg in passthru_kwargs:
        if arg in kwargs:
            req.meta[arg] = kwargs[arg]
    if reqtype == 'listing':
        req.meta['product_rating_for'] = kwargs['ads_id']
    req.meta['reqtype'] = reqtype  # We tell the type so that we can redo it if login is required
    req.meta['proxy'] = self.proxy  # meta[proxy] is handled by scrapy.
    if 'req_once_logged' in kwargs:
        req.meta['req_once_logged'] = kwargs['req_once_logged']
    return req
def process_response(self, request: Request, response, spider):
    """Downloader-middleware hook: pass through good responses, retry
    responses that look like an error page or a login wall.

    Returning the request object (instead of the response) reschedules it.
    """
    # Usage examples (kept from the original, translated):
    # print('current request ip:', request.meta.get('proxy'))
    # spider.logger.info('output url {}'.format(response.url))
    # response.text
    if response.status == 200:
        # "快捷登录" / "密码登录" are login-prompt markers ("quick login" /
        # "password login"): their presence means we were served a login
        # page instead of content.
        # NOTE(review): with `or`, the retry branch runs only when BOTH
        # markers are present — confirm `and` was not intended here.
        if "快捷登录" not in response.text or "密码登录" not in response.text:
            # print(response.text)
            return response
        else:
            # Page content looks abnormal — retry the request.
            logger.debug("页面数据异常,再次尝试请求......")
            request.dont_filter = True
            time.sleep(1)
            return request
    else:
        # Non-200 status — retry the request.
        logger.debug("请求出错,再次尝试请求......")
        request.dont_filter = True
        return request
def make_request(self, reqtype, **kwargs):
    """Create a scrapy Request for *reqtype*; unknown types raise.

    'user_feedback' requests are fully built but deliberately return None
    (disabled — redundant with ads feedback).
    """
    if 'url' in kwargs:
        kwargs['url'] = self.make_url(kwargs['url'])
    unfiltered = ('index', 'loginpage', 'dologin', 'captcha', 'ddos_protection')
    if reqtype == 'index':
        req = Request(self.make_url('index'))
    elif reqtype == 'loginpage':
        req = Request(self.make_url('loginpage'))
    elif reqtype == 'dologin':
        req = self.craft_login_request_from_form(kwargs['response'])
    elif reqtype == 'captcha':
        req = Request(self.make_url(kwargs['url']))
    elif reqtype == 'ddos_protection':
        req = self.create_request_from_ddos_protection(kwargs['response'])
    elif reqtype in ('category', 'listing', 'userprofile',
                     'listing_feedback', 'user_feedback', 'image'):
        req = Request(self.make_url(kwargs['url']))
        req.meta['shared'] = True
    else:
        raise Exception('Unsuported request type %s ' % reqtype)
    if reqtype in unfiltered:
        req.dont_filter = True
    if reqtype == 'listing_feedback':
        req.meta['product_rating_for'] = kwargs['listing_id']
    elif reqtype == 'user_feedback':
        req.meta['user_rating_for'] = kwargs['username']
    # Tag the request so it can be redone if login turns out to be required.
    req.meta['reqtype'] = reqtype
    req.meta['proxy'] = self.proxy  # meta[proxy] is handled by scrapy.
    if 'req_once_logged' in kwargs:
        req.meta['req_once_logged'] = kwargs['req_once_logged']
    if reqtype == 'user_feedback':
        # Disabled user feedback because it is redundant with ads_feedback
        return None
    return req
def make_request(self, **kwargs):
    """Build a plain scrapy Request from kwargs['url'], attaching the
    spider's proxy as both proxy and slot meta keys."""
    if 'url' in kwargs:
        kwargs['url'] = self.make_url(kwargs['url'])
    request = Request(kwargs['url'])
    if 'dont_filter' in kwargs:
        request.dont_filter = kwargs['dont_filter']
    request.meta['proxy'] = self.proxy  # meta[proxy] is handled by scrapy.
    request.meta['slot'] = self.proxy
    return request
def create_spider_request(self, kwargs):
    """Build an unfiltered Request from *kwargs* (must contain 'url').

    Raises Error('400', ...) when the remaining kwargs are not valid
    scrapy Request arguments.
    """
    url = kwargs.pop('url')
    try:
        req = Request(url, **kwargs)
    except (TypeError, ValueError) as e:
        message = "Error while creating Scrapy Request, {}".format(str(e))
        raise Error('400', message=message)
    req.dont_filter = True
    log_line = u"Created request for spider {} with url {} and kwargs {}".format(
        self.spider_name, url, repr(kwargs))
    log.msg(log_line)
    return req
def create_spider_request(self, kwargs):
    """Build an unfiltered Request from *kwargs* (must contain 'url').

    Raises
    ------
    Error
        A '400'-style error when the remaining kwargs are not valid
        scrapy Request arguments.
    """
    url = kwargs.pop('url')
    try:
        req = Request(url, **kwargs)
    except (TypeError, ValueError) as e:
        # Bad arguments for scrapy Request
        # we don't want to schedule spider if someone
        # passes meaingless arguments to Request.
        # We must raise this here so that this will be returned to client,
        # Otherwise if this is raised in spider_opened it goes to
        # spider logs where it does not really belong.
        # It is needed because in POST handler we can pass
        # all possible requests kwargs, so it is easy to make mistakes.
        # BUG FIX: `e.message` does not exist on Python 3 exceptions and
        # raised AttributeError instead of the intended Error; use str(e),
        # matching the sibling create_spider_request in this file.
        message = "Error while creating Request, {}".format(str(e))
        raise Error('400', message=message)
    req.dont_filter = True
    msg = u"Created request for spider {} with url {} and kwargs {}"
    msg = msg.format(self.spider_name, url, repr(kwargs))
    log.msg(msg)
    return req
def create_spider_request(self, kwargs):
    """Build an unfiltered Request from *kwargs* (must contain 'url').

    Raises
    ------
    Error
        A '400'-style error when the remaining kwargs are not valid
        scrapy Request arguments.
    """
    url = kwargs.pop('url')
    try:
        req = Request(url, **kwargs)
    except (TypeError, ValueError) as e:
        # Bad arguments for scrapy Request
        # we don't want to schedule spider if someone
        # passes meaingless arguments to Request.
        # We must raise this here so that this will be returned to client,
        # Otherwise if this is raised in spider_idle it goes to
        # spider logs where it does not really belong.
        # It is needed because in POST handler we can pass
        # all possible requests kwargs, so it is easy to make mistakes.
        # BUG FIX: `e.message` does not exist on Python 3 exceptions and
        # raised AttributeError instead of the intended Error; use str(e),
        # matching the sibling create_spider_request in this file.
        message = "Error while creating Request, {}".format(str(e))
        raise Error('400', message=message)
    req.dont_filter = True
    msg = u"Created request for spider {} with url {} and kwargs {}"
    msg = msg.format(self.spider_name, url, repr(kwargs))
    log.msg(msg)
    return req
def start_requests(self):
    """Yield an item (or request) per stored video document.

    Documents without a matching parse method or without a URL are
    skipped; each built item carries its own follow-up request.
    """
    for doc in self.db_adapter.get_videos(self._item):
        parse_method = self._get_parse_method(doc.get('site_url', ''))
        if not parse_method:
            continue
        url = doc.get('url', '')
        if not url:
            continue
        item = VideoZjcmItem(doc=doc,
                             next_request=None,
                             list_url='',
                             query=doc.get('query', ''),
                             attachments=[],
                             attachment_urls=[])
        request = Request(url, callback=parse_method, meta={'item': item})
        # noinspection PyUnresolvedReferences
        request.dont_filter = True
        item['next_request'] = request
        yield self.item_or_request(item)
def make_request(self, reqtype, **kwargs):
    """Build a scrapy Request for *reqtype*.

    A 'redirect_from' kwarg takes precedence over reqtype and produces a
    plain request tagged with the redirect origin.  The final 'shared'
    flag comes from kwargs when given, otherwise defaults to False.
    """
    if 'url' in kwargs:
        kwargs['url'] = self.make_url(kwargs['url'])
    if 'redirect_from' in kwargs:
        req = Request(kwargs['url'], headers=self.user_agent)
        req.meta['redirect_from'] = kwargs['redirect_from']
        req.dont_filter = True
    elif reqtype == 'index':
        req = Request(self.make_url('index'), headers=self.user_agent)
        req.dont_filter = True
    elif reqtype == 'ddos_protection':
        req = self.create_request_from_ddos_protection(kwargs['response'])
        req.meta['ddos_protection'] = True
        req.dont_filter = True
    elif reqtype == 'captcha':
        req = Request(kwargs['url'], headers=self.user_agent)
        req.dont_filter = True
    elif reqtype == 'dologin':
        req = self.create_request_from_login_page(kwargs['response'])
        req.dont_filter = True
    elif reqtype in ['threadlisting', 'thread']:
        req = Request(kwargs['url'], headers=self.user_agent)
        req.dont_filter = False
        req.meta['shared'] = True
        if reqtype == 'threadlisting':
            req.priority = 10
    req.meta['reqtype'] = reqtype  # We tell the type so that we can redo it if login is required
    req.meta['proxy'] = self.proxy  # meta[proxy] is handled by scrapy.
    if 'priority' in kwargs:
        req.priority = kwargs['priority']
    if 'req_once_logged' in kwargs:
        req.meta['req_once_logged'] = kwargs['req_once_logged']
    if 'shared' in kwargs:
        req.meta['shared'] = kwargs['shared']
    else:
        # FIX: was `elif 'shared' not in kwargs:` — a redundant re-test of
        # the negated condition; a plain else is equivalent.
        req.meta['shared'] = False
    return req
def retry_request_with_get(self, request: Request) -> Iterator[Request]:
    """Re-issue *request* as a GET, bypassing the duplicate filter."""
    request.dont_filter = True
    request.method = 'GET'
    yield request
def make_request(self, reqtype, **kwargs):
    """Build a request for *reqtype*.

    Handles login/captcha/DDoS flows, POST-form driven category paging
    ('category-page', deduplicated per (url, page number)), and the shared
    content types.  Unknown types raise; a duplicate category page
    returns None.
    """
    passthru = ['category']
    if 'url' in kwargs:
        kwargs['url'] = self.make_url(kwargs['url'])
    if reqtype == 'index':
        req = Request(self.make_url('index'))
        req.dont_filter = True
    elif reqtype == 'loginpage':
        req = Request(self.make_url('loginpage'))
        req.dont_filter = True
    elif reqtype == 'dologin':
        req = self.craft_login_request_from_form(kwargs['response'])
        req.dont_filter = True
    elif reqtype == 'captcha':
        req = Request(self.make_url(kwargs['url']))
        req.dont_filter = True
    elif reqtype == 'image':
        req = Request(self.make_url(kwargs['url']))
    elif reqtype == 'ddos_protection':
        req = self.create_request_from_ddos_protection(kwargs['response'])
        req.dont_filter = True
    elif reqtype == 'security_check':
        req = self.create_request_from_security_check(kwargs['response'])
        req.dont_filter = True
    elif reqtype == 'category':
        req = FormRequest.from_response(kwargs['response'],
                                        formcss=kwargs['formcss'],
                                        clickdata=kwargs['clickdata'])
    elif reqtype == 'category-page':
        # Changing page is done with a POST form.
        btn = kwargs['btn']
        name = btn.xpath('@name').extract_first()  # "page"
        val = btn.xpath('@value').extract_first()  # page number
        # Careful, if dofilter is set to 1 (default value), page will be empty.
        data = {name: val, 'dofilter': '0'}
        req = FormRequest.from_response(
            kwargs['response'],
            formdata=data,
            formxpath='//*[contains(@class, "pagination")]/ancestor::form')
        # BUG FIX: the first page seen for a URL used to be stored as an
        # empty list — the page number itself was never recorded, so the
        # same page could be yielded twice.  Record every yielded page.
        seen_pages = self.yielded_category_page.setdefault(req.url, [])
        if val in seen_pages:
            return None
        seen_pages.append(val)
    elif reqtype in ['userprofile', 'offer', 'offer-refund']:
        # NOTE: 'category' was removed from this list; it was unreachable,
        # shadowed by the FormRequest branch above.
        req = Request(self.make_url(kwargs['url']))
        req.meta['shared'] = True
    else:
        raise Exception('Unsuported request type %s ' % reqtype)
    req.meta['reqtype'] = reqtype  # We tell the type so that we can redo it if login is required
    req.meta['proxy'] = self.proxy  # meta[proxy] is handled by scrapy.
    if 'priority' in kwargs:
        req.priority = kwargs['priority']
    if reqtype == 'offer':
        offer_id = self.get_offer_id_from_url(req.url)
        req.meta['product_rating_for'] = offer_id
    for k in passthru:
        if k in kwargs:
            req.meta[k] = kwargs[k]
    if 'req_once_logged' in kwargs:
        req.meta['req_once_logged'] = kwargs['req_once_logged']
    return req