def process_topic(self, item):
    """Dump topic to file."""
    def prepend_domain(endpoint):
        return item['domain'][:-1] + endpoint if endpoint is not None else None

    # fetch topic pdf, docx urls
    doc_urls = item['doc_urls']
    pdf_url = docx_url = None
    for d in doc_urls:
        _, ext = splitext(basename(urlsplit(d).path))
        if ext.lower() == '.pdf':
            if pdf_url is not None:
                raise NotSupported('multiple pdf urls for source url: {}'.format(item['source_url']))
            pdf_url = prepend_domain(d.strip())
        elif ext.lower() == '.docx':
            docx_url = prepend_domain(d.strip())
        else:
            raise NotSupported('unsupported doc url file type: {} ({})'.format(d, ext))

    data = {
        'id': item['id'],
        'source_url': item['source_url'],
        'chapter_num': item['chapter_num'],
        'pdf_url': pdf_url,
        'docx_url': docx_url,
        # the following fields can be empty.
        # some topics only link to pdf/docx with no other content.
        'title': (
            self.html_parser.unescape(item['title'].strip())
            if item['title'] is not None else None
        ),
        'office': (
            self.html_parser.unescape(item['office'].strip())
            if item['office'] is not None else None
        ),
        'body': (
            self.html_parser.unescape(item['body'].strip())
            if item['body'] is not None else None
        ),
    }
    self.exporters['topics'].export_item(data)
    return item
def preserve(self, ad: JSONObject) -> None:
    self.logger.debug(ad)
    ad['isDetailed'] = False
    if ad['type'] == 'item' or ad['type'] == 'xlItem':
        timestamp = ad['value']['time']
        ad_id = ad['value']['id']
    elif ad['type'] == 'vip':
        timestamp = ad['value']['list'][0]['value']['time']
        ad_id = ad['value']['list'][0]['value']['id']
    else:
        raise NotSupported()

    if self.last_stamp == timestamp:
        self.page += 1
    else:
        self.last_stamp = timestamp
        self.page = 1

    if self.recent_collection.collection.find_one({'value.id': ad_id}):
        self.broken_ads += 1
        self.broken_ads_in_a_row += 1
    else:
        self.broken_ads_in_a_row = 0
        self.recent_collection.collection.insert_one(ad)

    if self.broken_ads_in_a_row > BROKEN_ADS_THRESHOLD:
        raise CloseSpider("Broken Ads threshold exceeded")
async def download_request(self, request, spider):
    scheme = urlparse_cached(request).scheme
    handler = self._get_handler(scheme)
    if not handler:
        raise NotSupported("Unsupported URL scheme '%s': %s" % (scheme, self._notconfigured[scheme]))
    return await handler.download_request(request, spider)
def download_request(self, request, spider):
    scheme = urlparse_cached(request).scheme  # find the download type, e.g. http
    handler = self._get_handler(scheme)  # look up the handler for that type
    if not handler:
        raise NotSupported("Unsupported URL scheme '%s': %s" % (scheme, self._notconfigured[scheme]))
    return handler.download_request(request, spider)
def download_request(self, request, spider):
    scheme = urlparse_cached(request).scheme
    handler = self._get_handler(scheme)
    if not handler:
        raise NotSupported(
            f"Unsupported URL scheme '{scheme}': {self._notconfigured[scheme]}"
        )
    return handler.download_request(request, spider)
def download_request(self, request, spider):
    scheme = urlparse_cached(request).scheme
    handler = self._get_handler(scheme)
    # print(handler, '###############')
    # <scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler object at 0x04697CD0>
    if not handler:
        raise NotSupported("Unsupported URL scheme '%s': %s" % (scheme, self._notconfigured[scheme]))
    return handler.download_request(request, spider)
def resolve_item_type(cls, document: JSONObject) -> Type[Enum]:
    if document['type'] == 'item':
        return cls['ITEM']
    elif document['type'] == 'xlItem':
        return cls['XLITEM']
    elif document['type'] == 'vip':
        return cls['VIP']
    else:
        raise NotSupported()
def resolve_item_value(document: JSONObject) -> JSONObject:
    """Resolves item value"""
    assert document is not None
    if document['type'] == 'item' or document['type'] == 'xlItem':
        return document['value']
    elif document['type'] == 'vip':
        return document['value']['list'][0]['value']
    else:
        raise NotSupported()
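A minimal usage sketch for the function above; the payloads are hypothetical, shaped only to match the branches the code handles ('item'/'xlItem' carry the value directly, 'vip' nests it under value.list[0].value):

# Hypothetical documents, inferred from the branches of resolve_item_value.
item_doc = {'type': 'item', 'value': {'id': 1, 'time': 1620000000}}
vip_doc = {'type': 'vip', 'value': {'list': [{'value': {'id': 2, 'time': 1620000001}}]}}

assert resolve_item_value(item_doc) == {'id': 1, 'time': 1620000000}
assert resolve_item_value(vip_doc) == {'id': 2, 'time': 1620000001}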
def download_request(self, request, spider):
    scheme = urlparse_cached(request).scheme
    try:
        handler = self._handlers[scheme]
    except KeyError:
        msg = self._notconfigured.get(scheme, 'no handler available for that scheme')
        raise NotSupported("Unsupported URL scheme '%s': %s" % (scheme, msg))
    return handler(request, spider)
def _add_middleware(self, mw):
    if hasattr(mw, 'open_spider'):
        self.methods['open_spider'].append(mw.open_spider)
    if hasattr(mw, 'close_spider'):
        if self.close_spider_order == 'numerical':
            self.methods['close_spider'].append(mw.close_spider)
        elif self.close_spider_order == 'default':
            self.methods['close_spider'].insert(0, mw.close_spider)
        else:
            raise NotSupported('CLOSESPIDER_CALLING_ORDER setting has to be either "default" or "numerical"')
def download_request(self, request, spider):
    ## get the request's URL scheme
    scheme = urlparse_cached(request).scheme
    ## get the download handler for that scheme
    handler = self._get_handler(scheme)
    if not handler:
        raise NotSupported("Unsupported URL scheme '%s': %s" % (scheme, self._notconfigured[scheme]))
    ## download the request through the handler and return the response
    return handler.download_request(request, spider)
def _connect(self, factory):
    host, port = factory.host, factory.port
    if factory.scheme == 'https':
        if ssl_supported:
            return reactor.connectSSL(host, port, factory,
                                      self.ClientContextFactory())
        raise NotSupported(
            "HTTPS not supported: install pyopenssl library")
    else:
        return reactor.connectTCP(host, port, factory)
def _check_crawlera_settings(self, splash_settings):
    # When using crawlera with splash we use a script to configure it
    # properly, but that means that some options that can be used in
    # render.html can't be used anymore.
    if splash_settings.get('endpoint', 'render.html') != 'render.html':
        raise NotSupported("Splash + Crawlera integration is only "
                           "implemented for the render.html endpoint")
    splash_args = splash_settings.get('args', {})
    not_implemented_options = {
        # option: (allowed values, ...)
        'js': (None, ''),
        'allowed_content_types': (None, ''),
        'forbidden_content_types': (None, ''),
    }
    for option, allowed in not_implemented_options.items():
        if option in splash_args and splash_args[option] not in allowed:
            raise NotSupported(
                "Splash option '%s' is not compatible with Crawlera" % option
            )
def download_request(self, request, spider):
    # 1. Get the URL scheme (http/https)
    scheme = urlparse_cached(request).scheme
    # 2. Pick the download handler for that scheme (http, https, ftp, etc.)
    handler = self._get_handler(scheme)
    if not handler:
        raise NotSupported("Unsupported URL scheme '%s': %s" % (scheme, self._notconfigured[scheme]))
    # 3. Start the download and return the result, e.g. scrapy/core/downloader/handlers/http11.py
    #    a. On a download error, each middleware's process_exception method is called in turn;
    #       note that process_exception and process_response run in reverse order
    #       (see scrapy/core/downloader/middleware.py).
    #    b. On success, the result propagates back to _next_request_from_scheduler
    #       in scrapy/core/engine.py.
    return handler.download_request(request, spider)
def getProxyPool(self, drop_list):
    with pymysql.connect(**self.dbparm) as cursor:
        if len(drop_list) > 0:
            format_str = ','.join(['%s'] * len(drop_list))
            sql = 'update python.proxytable set valid = 1 where address in({})'.format(format_str)
            cursor.execute(sql, tuple(drop_list))
        row_count = cursor.execute(
            'select address from python.proxytable'
            ' where valid = 0 order by rank,update_time desc limit 10')
        if row_count > 0:
            result = [item[0] for item in cursor.fetchall()]
            return result
        else:
            self.proxy_enabled = False
            raise NotSupported('proxy resources are exhausted')
def _handle_page_load(self, request, webpage, cookiejar, load_result):
    if cookiejar:
        yield sync_cookies(cookiejar, webpage)
    browser_response = request.meta.get('browser_response', False)
    try:
        ok, status, headers, exc = load_result
        if ok:
            if browser_response:
                respcls = BrowserResponse
            else:
                respcls = HtmlResponse
            url = yield webpage.callRemote('get_url')
            encoding, body = yield webpage.callRemote('get_body')
            response = respcls(status=status, url=url, headers=headers,
                               body=body, encoding=encoding, request=request)
            if browser_response:
                response._webpage = PBReferenceMethodsWrapper(webpage)
                response._semaphore = self._semaphore
                response._cookiejar = cookiejar
        else:
            if isinstance(exc, ScrapyNotSupported):
                exc = NotSupported(*exc.args)
            raise exc
    except Exception as err:
        browser_response = False
        response = Failure(err)
    finally:
        if not browser_response:
            try:
                yield webpage.callRemote('close')
            finally:
                self._semaphore.release()
    return response
def parse_XML(self, response):
    if not hasattr(self, 'parse_node'):
        raise NotConfigured('You must define parse_node method in order to scrape this XML feed')
    response = self.adapt_response(response)
    if self.iterator == 'iternodes':
        nodes = self._iternodes(response)
    elif self.iterator == 'xml':
        selector = Selector(response, type='xml')
        self._register_namespaces(selector)
        nodes = selector.xpath('//%s' % self.itertag)
    elif self.iterator == 'html':
        selector = Selector(response, type='html')
        self._register_namespaces(selector)
        nodes = selector.xpath('//%s' % self.itertag)
    else:
        raise NotSupported('Unsupported node iterator')
    return self.parse_nodes(response, nodes)
def _handle_page_load(self, request, webpage,
                      load_result=(True, 200, None, None)):
    """
    Handle a request for a web page, either a page load or a request to
    continue using an existing page object.
    """
    try:
        ok, status, headers, exc = load_result
        if ok:
            url = yield webpage.callRemote('get_url')
            browser_response = request.meta.get('browser_response', False)
            if browser_response:
                respcls = BrowserResponse
            else:
                respcls = HtmlResponse
            encoding, body = yield webpage.callRemote('get_body')
            response = respcls(status=status, url=url, headers=headers,
                               body=body, encoding=encoding, request=request)
            if browser_response:
                response.webpage = PbReferenceMethodsWrapper(webpage)
        else:
            if isinstance(exc, ScrapyNotSupported):
                exc = NotSupported(*exc.args)
            raise exc
    except Exception as err:
        response = Failure(err)
    return response
def _parse(self, response, **kwargs):
    # After the response comes back from the parent class's start_requests,
    # parse it with the configured iterator, call parse_node and return the results.
    if not hasattr(self, 'parse_node'):
        raise NotConfigured('You must define parse_node method in order to scrape this XML feed')
    response = self.adapt_response(response)  # call self.adapt_response before parsing
    if self.iterator == 'iternodes':
        nodes = self._iternodes(response)
    elif self.iterator == 'xml':
        selector = Selector(response, type='xml')
        self._register_namespaces(selector)
        nodes = selector.xpath(f'//{self.itertag}')
    elif self.iterator == 'html':
        selector = Selector(response, type='html')
        self._register_namespaces(selector)
        nodes = selector.xpath(f'//{self.itertag}')
    else:
        raise NotSupported('Unsupported node iterator')
    return self.parse_nodes(response, nodes)
def _parse(self, response, **kwargs):
    if not hasattr(self, "parse_node"):
        raise NotConfigured(
            "You must define parse_node method in order to scrape this XML feed"
        )
    response = self.adapt_response(response)
    if self.iterator == "iternodes":
        nodes = self._iternodes(response)
    elif self.iterator == "xml":
        selector = Selector(response, type="xml")
        self._register_namespaces(selector)
        nodes = selector.xpath(f"//{self.itertag}")
    elif self.iterator == "html":
        selector = Selector(response, type="html")
        self._register_namespaces(selector)
        nodes = selector.xpath(f"//{self.itertag}")
    else:
        raise NotSupported("Unsupported node iterator")
    return self.parse_nodes(response, nodes)
def start_requests(self):
    channel_number = self.settings.get('CHANNEL_NUMBER')
    channel_list = []
    base_url = 'https://www.pornhubpremium.com/channels/{0}/videos?o=ra'
    with open('channel.txt') as f:
        for channel in f:
            channel = channel.strip()
            if channel != '':
                channel_list.append(channel)
    # check CHANNEL_NUMBER is smaller than list length
    if isinstance(channel_number, int) and len(channel_list) < channel_number:
        raise NotSupported(
            'CHANNEL_NUMBER config is bigger than website channel list')
    if channel_number == 'ALL':
        for i in channel_list:
            yield scrapy.Request(base_url.format(i))
    else:
        for i in range(channel_number):
            yield scrapy.Request(base_url.format(channel_list[i]))
def _get_web_services(self):
    for ext in self.crawler.extensions.middlewares:
        if isinstance(ext, WebService):
            return "http://{host.host:s}:{host.port:d}".format(
                host=ext.port.getHost()).encode('utf-8')
    raise NotSupported('Web Not Supported Please Check!')
def xpath(self, *a, **kw):
    """Shortcut method implemented only by responses whose content
    is text (subclasses of TextResponse).
    """
    raise NotSupported("Response content isn't text")
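A minimal usage sketch, assuming the snippet above is the base scrapy.http.Response.xpath: selectors are only available on TextResponse subclasses, so calling .xpath() on a non-text response raises NotSupported.

from scrapy.exceptions import NotSupported
from scrapy.http import Response, TextResponse

binary = Response(url='http://example.com/image.png', body=b'\x89PNG')
try:
    binary.xpath('//title/text()')
except NotSupported:
    pass  # non-text responses do not support selectors

page = TextResponse(url='http://example.com',
                    body=b'<html><head><title>t</title></head></html>',
                    encoding='utf-8')
titles = page.xpath('//title/text()').getall()  # ['t']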
def xpath(self, a: object, kw: object) -> object:
    """Shortcut method implemented only by responses whose content
    is text (subclasses of TextResponse).
    """
    raise NotSupported("Response content isn't text")
def __init__(self, *args, **kw):
    raise NotSupported('HTTP1.1 not supported')
def parse(self, response):
    goods_all = json.loads(self.json_data)
    skus = goods_all['skus']
    goods_colors = goods_all['goods_colors']
    goods = goods_all['goods']
    if 'title' not in goods.keys():
        raise NotSupported('No title in goods, please specify!')
    title = ''.join(goods['title'].split()).lower()
    make_md5 = hashlib.md5()
    make_md5.update(quote(title, safe=''))
    show_product_id = make_md5.hexdigest()
    logging.warning('show_product_id: ' + show_product_id)
    logging.warning('title: ' + goods['title'])
    from_site = goods['from_site']

    skus_colors = []
    skus_sizes = []
    for sku in skus:
        if sku['color'] not in skus_colors:
            skus_colors.append(sku['color'])
            skus_sizes.append(sku['size'])
        if 'goods_current_price' not in dir() or goods_current_price > sku['current_price']:
            goods_current_price = sku['current_price']
        if 'goods_list_price' not in dir() or goods_list_price < sku['list_price']:
            goods_list_price = sku['list_price']
        sku['type'] = 'sku'
        sku['id'] = show_product_id + '-' + sku['color'] + '-' + sku['size']
        sku['is_outof_stock'] = False
        sku['from_site'] = from_site
        sku['show_product_id'] = show_product_id

    goods_colors_colors = []
    for goods_color in goods_colors:
        if 'name' not in goods_color.keys():
            raise NotSupported('ERROR! No name in (color)')
        goods_colors_colors.append(goods_color['name'])
        if 'cover' not in goods.keys():
            goods['cover'] = goods_color['images'][0]['image']
        if 'images' not in goods_color.keys() or len(goods_color['images']) == 0 or (
                len(goods_color['images']) == 1 and len(goods_color['images'][0]) == 0):
            goods_color['images'] = [{'image': goods['cover']}]
            goods_color['cover'] = goods['cover']
        else:
            goods_color['cover'] = goods_color['images'][0]['image']
        goods_color['from_site'] = from_site
        goods_color['show_product_id'] = show_product_id

    # if goods_colors_colors != list(set(goods_colors_colors)):
    #     raise NotSupported('ERROR! Duplicate name in goods_colors')
    if goods_colors_colors == [] or skus_colors == [] or sorted(skus_colors) != sorted(goods_colors_colors):
        raise NotSupported('skus_colors: ' + str(skus_colors) + ', ' +
                           'goods_colors_colors: ' + str(goods_colors_colors) + ', \n' +
                           'goods_colors not equal sku_colors')

    for goods_color in goods_colors:
        colorItem = Color()
        colorItem = goods_color
        colorItem['type'] = 'color'
        yield colorItem

    item = BaseItem()
    item = goods
    item['type'] = 'base'
    item['product_type'] = 'json_import_' + goods['product_type_id']
    item['category'] = 'json_import_' + goods['category_id']
    item['product_type_id'] = goods['product_type_id']
    item['category_id'] = goods['category_id']
    item['title'] = goods['title']
    item['show_product_id'] = show_product_id
    item['from_site'] = from_site
    item['colors'] = skus_colors
    item['sizes'] = skus_sizes
    item['current_price'] = goods_current_price
    item['list_price'] = goods_list_price
    if 'groupbuy_num' in goods.keys():
        item['groupbuy_num'] = goods['groupbuy_num']
    item['skus'] = skus
    print('skus', skus)
    if 'desc' in goods.keys():
        item['desc'] = goods['desc']
    if 'weight' in goods.keys():
        item['weight'] = goods['weight']
    yield item
def xpath(self, *a, **kw):
    raise NotSupported("Response content isn't text")