def parse_article(self, response):
    if all(map(lambda url: url not in response.url, self.disallowed_urls)):
        article_loader = ArticleLoader(response=response)
        article_loader.add_value(
            "code", self.article_code_regex.search(response.url).group(1))
        article_loader.add_value("url", response.url)
        article_loader.add_css("category",
                               "meta[property*=section]::attr(content)")
        article_info = json.loads(
            response.css("script[type='application/ld+json']::text")
            .extract_first(default="{}"))
        article_loader.add_value("source", article_info,
                                 SelectJmes("author.name"))
        article_loader.add_value("headline", article_info,
                                 SelectJmes("headline"))
        article_loader.add_value("description", article_info,
                                 SelectJmes("description"))
        article_loader.add_value("time", article_info,
                                 SelectJmes("dateModified"))
        article_soup = BeautifulSoup(
            response.css(".article__body").extract_first(), "lxml")
        for br_tag in article_soup.find_all("br"):
            br_tag.replace_with("\n")
        lines = []
        for paragraph_tag in article_soup.find_all("p", class_="body-text"):
            for line in self.linebreak_regex.split(paragraph_tag.get_text()):
                lines.append(line.strip())
        article_loader.add_value("content", "\n".join(lines))
        yield article_loader.load_item()
def parse(self, response):
    jsonresponse = json.loads(response.body_as_unicode())
    # yield jsonresponse
    for c in jsonresponse['list']:
        loader = ItemLoader(item=StockCubesItem())
        loader.default_input_processor = MapCompose(str)
        loader.default_output_processor = Join(' ')
        for (field, path) in self.jmes_paths.items():
            loader.add_value(field, SelectJmes(path)(c))
        item = loader.load_item()

        ownerLoader = ItemLoader(item=OwnerItem())
        ownerLoader.default_input_processor = MapCompose(str)
        ownerLoader.default_output_processor = Join(' ')
        for (field, path) in self.owner_jmes_paths.items():
            ownerLoader.add_value(field, SelectJmes(path)(c['owner']))
        owner = ownerLoader.load_item()
        item['owner'] = owner
        yield item

        # Start extracting the user's cubes.
        uid = owner['id']
        # https://stock.xueqiu.com/v5/stock/portfolio/stock/list.json?size=1000&category=3&uid=6626771620&pid=-24 (cubes the user created)
        createdCubeUrl = f'https://stock.xueqiu.com/v5/stock/portfolio/stock/list.json?size=1000&category=3&uid={uid}&pid=-24'
        # Request the cubes created by the user; pass extra arguments to the
        # callback via cb_kwargs.
        yield scrapy.Request(
            createdCubeUrl,
            self.parseCubeList,
            headers=self.send_headers,
            cb_kwargs=dict(uid=uid, screen_name=owner['screen_name']))
        # Request the cubes the user follows. uid and screen_name are not
        # passed here; in that case they are recovered by parsing the web page.
        # TODO fetching the page is very slow -- optimize, maybe with multiple threads?
        followedCubeUrl = f'https://stock.xueqiu.com/v5/stock/portfolio/stock/list.json?size=1000&category=3&uid={uid}&pid=-120'
        yield scrapy.Request(followedCubeUrl,
                             self.parseCubeList,
                             headers=self.send_headers)
        # Cube details:
        # https://xueqiu.com/cubes/quote.json?code=ZH976766,SP1034535,SP1012810,ZH1160206,ZH2003755,ZH1996976,ZH1079481,ZH1174824,ZH1079472,SP1040320
    page = jsonresponse['page']
    maxPage = jsonresponse['maxPage']
    if page < maxPage:
        url = f'{self.cube_discover_url}{page + 1}'
        yield scrapy.Request(url, headers=self.send_headers)
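# The Xueqiu snippets above and below reference self.jmes_paths and
# self.owner_jmes_paths without defining them. A minimal sketch of what such
# mappings could look like -- the paths are illustrative assumptions, not the
# documented Xueqiu schema; only 'symbol', 'id', and 'screen_name' are implied
# by the surrounding code:
jmes_paths = {
    'symbol': 'symbol',        # cube code, e.g. "ZH976766"
    'name': 'name',            # hypothetical display name
    'net_value': 'net_value',  # hypothetical latest net value
}
owner_jmes_paths = {
    'id': 'id',
    'screen_name': 'screen_name',
}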
def scrape_pages(self, response):
    # logging.error("response", response)
    # print(response)
    # item = LikefolioItem()
    # item["data"] = response.xpath("/html/body/pre/text()[1]").get()
    # item["data"] = response.xpath(
    #     "/html/body/div[4]/div/div/div[2]/div[1]/h4/span[2]"
    # )
    # yield item
    jsonresponse = json.loads(response.body_as_unicode())
    for user in jsonresponse:
        loader = ItemLoader(item=LikefolioItem())
        loader.default_input_processor = MapCompose(str)
        loader.default_output_processor = Join(" ")
        for (field, path) in self.cpi_paths.items():
            loader.add_value(field, SelectJmes(path)(user))
        yield loader.load_item()
def _get_json_loaders(self, response, field_extractor, item_cls, loader_cls):
    """Parse a JSON response and extract items from it."""

    def _generate_loader(d):
        if not isinstance(d, dict):
            log.warning("Element is not a dict; check the json_path syntax!")
            return None
        loader = loader_cls(item=item_cls())
        for k, v in d.items():
            loader.add_value(k, v)
        return loader

    json_path = field_extractor["json_path"]
    # response.text is already decoded, so no encoding argument is needed here
    # (json.loads dropped its encoding parameter in Python 3.9).
    jn = json.loads(response.text)
    # If the query returns a list, each element is one item;
    # if it returns a dict, that dict is the item itself.
    results = SelectJmes(json_path=json_path)(jn)
    if isinstance(results, dict):
        results = [results]
    if not isinstance(results, list):
        log.warning(
            "Malformed JSON query; check json_path: {}".format(json_path))
        return []
    loaders = []
    for d in results:
        if not d:
            continue
        temp_loader = _generate_loader(d)
        if temp_loader:
            loaders.append(temp_loader)
    return loaders
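# A minimal usage sketch for _get_json_loaders, under stated assumptions: the
# field_extractor dict, MyItem, and the response payload shape are all
# hypothetical.
field_extractor = {"json_path": "data.results[]", "encoding": "utf-8"}
# Given a response body like
#   {"data": {"results": [{"title": "a"}, {"title": "b"}]}}
# SelectJmes("data.results[]") returns the two dicts, and each becomes one loader:
# loaders = spider._get_json_loaders(response, field_extractor,
#                                    item_cls=MyItem, loader_cls=ItemLoader)
# items = [ldr.load_item() for ldr in loaders]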
def test_output(self):
    for l in self.test_list_equals:
        expr, test_list, expected = self.test_list_equals[l]
        test = SelectJmes(expr)(test_list)
        self.assertEqual(
            test, expected,
            msg='test "{}" got {} expected {}'.format(l, test, expected))
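# The fixture iterated above is not shown; a plausible sketch, with made-up
# case names and data (the real suite's entries may differ):
test_list_equals = {
    'simple': ('foo.bar', {'foo': {'bar': 'baz'}}, 'baz'),
    'missing': ('foo.baz', {'foo': {'bar': 'baz'}}, None),
    'projection': ('foo[*].bar', {'foo': [{'bar': 1}, {'bar': 2}]}, [1, 2]),
}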
def parse_cube_info(self, response, symbol_list):
    json_response = json.loads(response.body_as_unicode())
    for s in symbol_list:
        loader = ItemLoader(item=CubeItem())
        loader.default_input_processor = MapCompose(str)
        loader.default_output_processor = Join(' ')
        for (field, path) in self.jmes_paths.items():
            loader.add_value(field, SelectJmes(path)(json_response[s]))
        item = loader.load_item()
        yield item
def parse(self, response):
    jsonresp = json.loads(
        response.body_as_unicode())['vaccination_county_condensed_data']
    for vacc in jsonresp:
        loader = ItemLoader(item=CcrawlerItem())
        loader.default_input_processor = MapCompose(str)
        loader.default_output_processor = Join(',')
        for (field, path) in self.jmes_paths.items():
            loader.add_value(field, SelectJmes(path)(vacc))
        yield loader.load_item()
def parseCubeInfo(self, response, uid, screen_name, symbolList):
    jsonresponse = json.loads(response.body_as_unicode())
    for s in symbolList:
        loader = ItemLoader(item=StockCubesItem())
        loader.default_input_processor = MapCompose(str)
        loader.default_output_processor = Join(' ')
        for (field, path) in self.jmes_paths.items():
            loader.add_value(field, SelectJmes(path)(jsonresponse[s]))
        item = loader.load_item()
        owner = OwnerItem()
        owner['id'] = uid
        owner['screen_name'] = screen_name
        item['owner'] = owner
        yield item
def _extract_links(self, json_path, response):
    # Extract the concrete urls via json_path.
    try:
        # If a regex is configured, use it to cut the JSON payload out of the body.
        json_re = response.meta.get('json_re', None)
        response_text = response.text
        if json_re:
            mo = re.search(pattern=json_re,
                           string=response_text,
                           flags=re.S | re.M | re.I)
            if mo:
                response_text = mo.group(1)
        # The body is expected to be JSON, so decode it first; this may fail.
        j = json.loads(response_text)
    except Exception as e:
        log.error(e)
        return []
    json_func = SelectJmes(json_path)
    results = json_func(j)
    if not results:
        log.warning(
            "json_path:{0} matched no links in the response, bailing out!"
            .format(json_path))
        return []
    links = []
    base_url = get_base_url(response)
    results = arg_to_iter(results)
    for url_texts in results:
        try:
            url = str(url_texts.get('url', ''))
            if not url:
                continue
            url = strip_html5_whitespace(url)
            url = urljoin(base_url, url)
            url = self.process_attr(url)
            if not url:
                continue
            url = urljoin(response.url, url)
            text = url_texts.get('text', '')
            fragment = str(url_texts.get("fragment", ""))
            link = Link(url=url, text=text, fragment=fragment)
            links.append(link)
        except Exception as e:
            log.error(e)
    return self._deduplicate_if_needed(links)
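# A sketch of the payload shape _extract_links expects, with made-up data: the
# JMESPath query must resolve to dicts carrying at least a "url" key, and
# optionally "text" and "fragment":
from scrapy.loader.processors import SelectJmes

sample = {"data": {"links": [
    {"url": "/article/1", "text": "first", "fragment": "top"},
    {"url": "https://example.com/article/2", "text": "second"},
]}}
print(SelectJmes("data.links")(sample))  # -> the list of two link dicts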
def parse(self, response): """Main parse method.""" jsonresponse = json.loads(response.body_as_unicode()) for user in jsonresponse: loader = ItemLoader( item=UserItem()) # create an ItemLoader to populate a UserItem loader.default_input_processor = MapCompose( str) # apply str conversion on each value loader.default_output_processor = Join(' ') for (field, path) in self.jmes_paths.items(): loader.add_value(field, SelectJmes(path)(user)) yield loader.load_item()
class BaiduItem(scrapy.Item):
    # define the fields for your item here like:
    # category = scrapy.Field()  # crawl category
    # source_web = scrapy.Field()  # name of the source site
    # way = scrapy.Field()  # crawl mode (2 = keyword search)
    meta = scrapy.Field()
    hash_id = scrapy.Field(key=True)  # hash of the link
    title = scrapy.Field()  # news headline
    link = scrapy.Field()  # news URL
    author = scrapy.Field()  # news author
    post_time = scrapy.Field()  # publication time
    # image_urls = scrapy.Field()
    search_word = scrapy.Field()  # search keyword
    cover_urls = scrapy.Field()  # cover image URLs
    # News body; items with no content are filtered out when the item is built.
    content = scrapy.Field(
        nullable=True,
        input_processor=MapCompose(
            SelectJmes(
                'data.news[0].content[].{type: type, data: data.original.url || data}'
            )),
        output_processor=Identity())
def parse_page(self, response):
    # Extract the JSON from the tag: <script id="__NEXT_DATA__" type="application/json">
    jsonresponse = json.loads(
        response.css('#__NEXT_DATA__::text').extract()[0])
    # Access the historical data within the JSON object.
    nestedJson = jsonresponse['props']['initialState']['cryptocurrency']['ohlcvHistorical']
    # Retrieve the id of the crypto (a key value).
    id = [str(k) for k in nestedJson.keys()][0]
    # Get the name of the respective crypto.
    name = nestedJson[id]['name']
    # Save the ticker symbol.
    ticker = nestedJson[id]['symbol']
    # Access the historical data: e.g. Open, Close, High, Low, etc.
    data = nestedJson[id]['quotes']
    for d in data:
        loader = ItemLoader(item=CryptoItem())
        loader.default_input_processor = MapCompose(str)
        loader.default_output_processor = Join('')
        for (field, path) in self.jmes_paths.items():
            loader.add_value(field, SelectJmes(path)(d))
        loader.add_value("Name", name)
        loader.add_value("Ticker", ticker)
        yield loader.load_item()
def parse(self, response): """ 雪球组合发现页面请求之后对内容进行解析 :param response: 请求返回来的json字符串 :return: """ json_response = json.loads(response.body_as_unicode()) for c in json_response['list']: if c['symbol'] == 'ZH696958': c['close_at'] = "1574600533585" # loader = ItemLoader(item=CubeItem()) # loader.default_input_processor = MapCompose(str) # loader.default_output_processor = Join(' ') # # for (field, path) in self.jmes_paths.items(): # loader.add_value(field, SelectJmes(path)(c)) # item = loader.load_item() wrapper_item = CubeItemWrapper.from_dict(c) item = CubeItem() item['symbol'] = c['symbol'] item['data_item'] = wrapper_item owner_loader = ItemLoader(item=OwnerItem()) owner_loader.default_input_processor = MapCompose(str) owner_loader.default_output_processor = Join(' ') for (field, path) in self.owner_jmes_paths.items(): owner_loader.add_value(field, SelectJmes(path)(c['owner'])) owner = owner_loader.load_item() yield item profit_since_time = byte_to_str( self.r.get(f'{item["symbol"]}_profit_since_time')) self.logger.warning( f'redis---> get the profit_since_time: {profit_since_time}') params = '' if profit_since_time: params = f'&since={profit_since_time}&until={int(time.time()) * 1000}' # 请求收益列表 yield scrapy.Request( f'{self.cube_profit_url}{item["symbol"]}{params}', self.parse_cube_profit_list, headers=self.send_headers) # 请求调仓记录 yield scrapy.Request(f'{self.cube_rebalance_url}{item["symbol"]}', self.parse_cube_rebalance_list, headers=self.send_headers, cb_kwargs=dict(symbol=item["symbol"])) """ DELETEME # 开始提取用户信息 uid = owner['id'] # https://stock.xueqiu.com/v5/stock/portfolio/stock/list.json?size=1000&category=3&uid=6626771620&pid=-24(创建的组合) created_cube_url = f'https://stock.xueqiu.com/v5/stock/portfolio/stock/list.json?size=1000&category=3&uid={uid}&pid=-24' # 请求用户创建的组合 # 通过cb_kwargs的方式,给解析函数传递参数 yield scrapy.Request(created_cube_url, self.parse_cube_list, headers=self.send_headers, cb_kwargs=dict(uid=uid, screen_name=owner['screen_name'])) # 请求用户关注的组合,这个地方不去传递uid和screen_name信息,这种情况下,通过请求网页去解析, # TODO 请求网页的速度超慢,想办法优化,开启多线程? followed_cube_url = f'https://stock.xueqiu.com/v5/stock/portfolio/stock/list.json?size=1000&category=3&uid={uid}&pid=-120' yield scrapy.Request(followed_cube_url, self.parse_cube_list, headers=self.send_headers) # 组合信息: # https://xueqiu.com/cubes/quote.json?code=ZH976766,SP1034535,SP1012810,ZH1160206,ZH2003755,ZH1996976,ZH1079481,ZH1174824,ZH1079472,SP1040320 """ page = json_response['page'] max_page = json_response['maxPage'] if page < max_page: url = f'{self.cube_discover_url}{page + 1}' yield scrapy.Request(url, headers=self.send_headers)
def parse(self, response: Response):
    """
    Parse the response and populate the item.
    :param response:
    :return:
    """
    res_json = response.text
    extract_status = Compose(json.loads, SelectJmes("status"))
    status = extract_status(res_json)
    if status is None:  # the success case
        extract_result = Compose(json.loads, SelectJmes("content"),
                                 SelectJmes("positionResult"),
                                 SelectJmes("result"))
        result_list = extract_result(response.text)
        for res in result_list:
            loader = LagouItemLoader(item=LagouItem())
            loader.add_value("post_time", res)
            loader.add_value("job_name", res)
            loader.add_value("salary", res)
            loader.add_value("place", res)
            loader.add_value("job_nature", res)
            loader.add_value("experience", res)
            loader.add_value("education", res)
            loader.add_value("job_kind", res)
            loader.add_value("advantage", res)
            loader.add_value("company_name", res)
            loader.add_value("company_size", res)
            loader.add_value("company_industry", res)
            loader.add_value("id", res)
            loader.add_value(
                "link",
                self.job_detail_url.format(id=loader.get_output_value("id")))
            this_item = loader.load_item()
            yield this_item
            # yield Request(
            #     url=this_item.get("link"),
            #     headers=self.header_dict,
            #     meta={"cookiejar": uuid.uuid4(), "item": this_item},
            #     callback=self.parse_other,
            #     priority=5,
            # )
    else:
        # The request failed: fetch the home page again to refresh the
        # cookies, then re-issue the original request.
        key = uuid.uuid4()
        yield Request(url=self.get_cookies_url,
                      callback=self.empty,
                      meta={"cookiejar": key},
                      headers=self.header_dict,
                      priority=5,
                      dont_filter=True)
        yield FormRequest(url=response.url,
                          formdata={
                              "first": "true",
                              "pn": str(response.meta['page']),
                              'kd': ""
                          },
                          callback=self.parse,
                          meta={
                              "cookiejar": key,
                              "page": response.meta['page']
                          },
                          method="POST",
                          headers=self.header_dict,
                          priority=4,
                          dont_filter=True)
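A note on the loader pattern above: each add_value call receives the entire res dict, and the per-field selection happens inside the item's input processors, which chain TakeFirst() with SelectJmes(...) for the matching key; see the LagouItem definition further down.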
import re
import json

import requests
from scrapy.loader.processors import SelectJmes

url = 'https://detail.m.tmall.com/item.htm?spm=a1z10.3-b-s.w4011-14902430375.63.79a74ce1CfpmiQ&id=556955516923&rn=8ee79051f8726d88073713f6e0fafdd1&abbucket=13&sku_properties=1627207:28329'

session = requests.Session()
resp = session.get(url)
detail = re.findall('DATA_Detail.*?\n?({.*})', resp.text)[0]
mdskip = re.findall('DATA_Mdskip.*?\n?({.*})', resp.text)[0]
detail = json.loads(detail)
mdskip = json.loads(mdskip)

# "[a,b,...]" after a key is a JMESPath multiselect list: it projects the
# named sub-fields into a list.
item = SelectJmes("item.[title,tmallDescUrl,subtitle,images,itemId]")
item = item(detail)
props = SelectJmes("props.groupProps")
props = props(detail)
skuBase = SelectJmes("skuBase.props[].[name,values]")
skuBase = skuBase(detail)
seller = SelectJmes("seller")
seller = seller(detail)
price = SelectJmes("price")
price = price(mdskip)
kuaidi = SelectJmes("delivery.[areaId,from,postage]")
kuaidi = kuaidi(mdskip)
sellCount = SelectJmes("item.sellCount")
sellCount = sellCount(mdskip)
print(item, props, skuBase, seller, price, kuaidi, sellCount)
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from scrapy.loader.processors import TakeFirst
from scrapy.loader.processors import Join
from scrapy.loader.processors import Compose
from scrapy.loader.processors import MapCompose
from scrapy.loader.processors import SelectJmes

processor = SelectJmes('foo')
print(processor({'foo': 'bar'}))  # bar

# MapCompose applies the chain of functions to each element of the iterable.
processor = MapCompose(str.upper, lambda s: s.strip())
print(processor(['hello', 'world', 'python']))  # ['HELLO', 'WORLD', 'PYTHON']

# Compose passes the whole value through the chain of functions.
processor = Compose(str.upper, lambda s: s.strip())
print(processor(" hello world"))  # HELLO WORLD

processor = Join(',')
# processor = Join()
print(processor(['one', 'two', 'three']))  # one,two,three

processor = TakeFirst()
print(processor(['', 1, 2, 3]))  # 1 -- TakeFirst skips None and empty values
+ The input value of this processor is iterated and the first function is applied to each element. The results of these function calls (one per element) are concatenated to construct a new iterable, which is then used to apply the second function, and so on, until the last function has been applied to each value of the list of values collected so far. The output values of the last function are concatenated together to produce the output of this processor.
+ Each particular function can return a value or a list of values. Functions may also return 'None'; in that case the output of that function is dropped and processing continues down the chain.
+ This processor provides a convenient way to compose functions that only work with single values (instead of iterables). For this reason the MapCompose processor is typically used as an input processor, since data is often extracted using the extract() method of selectors, which returns a list of unicode strings.
- Example:
>>> def filter_world(x):
...     return None if x == 'world' else x
...
>>> from scrapy.loader.processors import MapCompose
>>> proc = MapCompose(filter_world, unicode.upper)
>>> proc([u'hello', u'world', u'this', u'is', u'scrapy'])
[u'HELLO', u'THIS', u'IS', u'SCRAPY']
__________________________________________________________
'class scrapy.loader.processors.SelectJmes(json_path)'
- Queries the value using the JSON path provided to the constructor and returns it as output. Requires jmespath (https://github.com/jmespath/jmespath.py) to run. This processor only takes one input at a time.
- Example:
>>> from scrapy.loader.processors import SelectJmes, Compose, MapCompose
>>> proc = SelectJmes("foo")  # for direct use on lists and dictionaries
>>> proc({'foo': 'bar'})
'bar'
>>> proc({'foo': {'bar': 'baz'}})
{'bar': 'baz'}
__
- Working with JSON:
>>> import json
>>> proc_single_json_str = Compose(json.loads, SelectJmes("foo"))
>>> proc_single_json_str('{"foo": "bar"}')
u'bar'
>>> proc_json_list = Compose(json.loads, MapCompose(SelectJmes('foo')))
>>> proc_json_list('[{"foo":"bar"}, {"baz":"tar"}]')
[u'bar']
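- SelectJmes also accepts JMESPath projections over lists; a small sketch (the output follows jmespath semantics):
>>> from scrapy.loader.processors import SelectJmes
>>> proc = SelectJmes('foo[*].bar')
>>> proc({'foo': [{'bar': 1}, {'bar': 2}]})
[1, 2]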
class LagouItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    link = scrapy.Field()  # url
    id = scrapy.Field(
        input_processor=Compose(TakeFirst(), SelectJmes("positionId")),
        output_processor=TakeFirst(),
    )  # rloc
    post_time = scrapy.Field(
        input_processor=Compose(TakeFirst(), SelectJmes("createTime"),
                                process_post_time),
    )  # lastmod
    job_name = scrapy.Field(
        input_processor=Compose(TakeFirst(), SelectJmes("positionName")),
    )  # title
    salary = scrapy.Field(
        input_processor=Compose(TakeFirst(), SelectJmes("salary")),
    )  # salary
    place = scrapy.Field(
        input_processor=Compose(TakeFirst(), SelectJmes("city")),
    )  # city
    job_nature = scrapy.Field(
        input_processor=Compose(TakeFirst(), SelectJmes("jobNature")),
    )  # type
    experience = scrapy.Field(
        input_processor=Compose(TakeFirst(), SelectJmes("workYear")),
    )  # experience
    education = scrapy.Field(
        input_processor=Compose(TakeFirst(), SelectJmes("education")),
    )  # education
    # job_number = scrapy.Field()  # number
    job_kind = scrapy.Field(
        input_processor=Compose(TakeFirst(), SelectJmes("firstType")),
    )  # jobsecondclass
    advantage = scrapy.Field(
        input_processor=Compose(TakeFirst(), SelectJmes("positionAdvantage"),
                                replace_all_n),
    )  # ori_welfare
    company_name = scrapy.Field(
        input_processor=Compose(TakeFirst(), SelectJmes("companyFullName")),
    )  # officialname
    company_size = scrapy.Field(
        input_processor=Compose(TakeFirst(), SelectJmes("companySize")),
    )  # size
    company_industry = scrapy.Field(
        input_processor=Compose(TakeFirst(), SelectJmes("industryField")),
    )  # industry
    job_content = scrapy.Field(
        input_processor=Compose(TakeFirst(), replace_all_n),
    )
    job_place = scrapy.Field(
        input_processor=Compose(TakeFirst(), replace_all_n,
                                process_job_place),
    )
    company_homepage = scrapy.Field(
        input_processor=Compose(TakeFirst(), replace_all_n),
    )  # official
"true", "type": "jsonp", "dataType": "jsonp", "callback": "mtopjsonp1", "data": "{\"exParams\":\"{\\\"id\\\":\\\"560129744052\\\",\"itemNumId\":\"560129744052\"}" } headers = { 'cache-control': "no-cache", } response = requests.request("GET", url, headers=headers, params=querystring) data = json.loads(response.text.replace('mtopjsonp1(', '').rstrip(')'))['data'] skuBase = SelectJmes("skuBase.props") skuBase = skuBase(data) props = SelectJmes("props") props = props(data) seller = SelectJmes("seller") seller = seller(data) item = SelectJmes("item.[title,tmallDescUrl,subtitle,images,itemId]") item = item(data) apiStack = SelectJmes("apiStack[].value") apiStack = json.loads(apiStack(data)[0]) kuaidi = SelectJmes("delivery.[areaId,from,postage]") kuaidi = kuaidi(apiStack) price = SelectJmes("price") price = price(apiStack) sellCount = SelectJmes("item.sellCount") sellCount = sellCount(apiStack)