def get_data(self, resp):
    """Parse a listing page and yield one record per table row.

    Each yielded dict has the shape::

        {"name": ..., "detail_url": "http://www.dachenglaw.com/cn/professionals/qi.ao.html",
         "email": ..., "position": ..., "location": ...}

    NOTE(review): ``extra_first`` is defined elsewhere; presumably it returns
    the first entry of an XPath result list as a string -- confirm, since the
    URL concatenation and the ``.split()`` call below rely on a string.

    Args:
        resp: Response object whose body is read from ``resp.content`` and
            decoded with the default codec (assumes ``content`` is bytes).
            A falsy ``resp`` produces no records.

    Yields:
        dict: One record for every ``//tbody/tr`` row in the document.
    """
    if not resp:
        # Empty response: nothing to parse.
        return

    html = etree.HTML(resp.content.decode())
    for tr in html.xpath('//tbody/tr'):
        name = tr.xpath('./td[1]/a/text()')
        # The href is appended to the site root (assumes it is site-relative).
        detail_url = 'http://www.dachenglaw.com' + extra_first(
            tr.xpath('./td[1]/a/@href'))
        email = tr.xpath('./td[2]/text()')
        position = tr.xpath('./td[3]/text()')
        location = tr.xpath('./td[4]/text()')

        yield {
            "name": extra_first(name),
            "detail_url": detail_url,
            "email": extra_first(email),
            "position": extra_first(position),
            # split() + join drops every run of whitespace from the value.
            "location": ''.join(extra_first(location).split()),
        }
def get_data(self, resp):
    """Yield one record per ``<li>`` under the ``pc_temp_songlist `` container.

    Every record carries the keys ``id``, ``title``, ``detail_url`` and
    ``time``; each value is whatever ``extra_first`` (defined elsewhere)
    returns for the matching XPath result.

    Args:
        resp: Response object whose ``content`` is decoded with the default
            codec and parsed as HTML. A falsy ``resp`` yields nothing.

    Yields:
        dict: One record per matched list item, keys in the order above.
    """
    if not resp:
        return

    document = etree.HTML(resp.content.decode())

    # (output key, XPath relative to the <li>) in output order.
    field_paths = (
        ("id", "./span[3]/text()"),
        ("title", "./@title"),
        ("detail_url", "./a/@href"),
        ("time", "./span[4]/span/text()"),
    )

    for entry in document.xpath('//div[@class="pc_temp_songlist "]/ul/li'):
        yield {key: extra_first(entry.xpath(path)) for key, path in field_paths}
def get_data(self, resp):
    """Yield one record per ``//div/table`` element, skipping the first.

    Every record carries the keys ``title``, ``detail_url``, ``info``,
    ``score`` and ``desc``; each value is whatever ``extra_first`` (defined
    elsewhere) returns for the matching XPath result.

    Args:
        resp: Response object whose ``content`` is decoded with the default
            codec and parsed as HTML. A falsy ``resp`` yields nothing.

    Yields:
        dict: One record per table after the first, keys in the order above.
    """
    if not resp:
        return

    root = etree.HTML(resp.content.decode())

    def first(node, path):
        # Evaluate the XPath on the node and reduce it via extra_first.
        return extra_first(node.xpath(path))

    # The first matched table is not a record; start from the second.
    for block in root.xpath('//div/table')[1:]:
        record = {}
        record["title"] = first(block, "./tr/td[2]/div/a/text()")
        record["detail_url"] = first(block, "./tr/td[2]/div/a/@href")
        record["info"] = first(block, "./tr/td[2]/p/text()")
        record["score"] = first(block, "./tr/td[2]/div[2]/span[2]/text()")
        record["desc"] = first(block, './tr/td[2]/p[2]/text()')
        yield record
def get_data(self, resp):
    """Parse a listing page and yield one record per ``//dl/dd`` element.

    Every record carries the keys ``id``, ``detail_url``, ``title``,
    ``actors``, ``releasetime`` and ``score``.

    NOTE(review): ``extra_first`` is defined elsewhere; presumably it returns
    the first entry of an XPath result list as a string -- confirm, since the
    URL and score concatenations below rely on that.

    Args:
        resp: Response object whose body is read from ``resp.content`` and
            decoded with the default codec (assumes ``content`` is bytes).
            A falsy ``resp`` produces no records.

    Yields:
        dict: One record per matched ``<dd>``, keys in the order above.
    """
    if not resp:
        # Empty response: nothing to parse.
        return

    html = etree.HTML(resp.content.decode())
    for dd in html.xpath('//dl/dd'):
        item_id = dd.xpath('./i/text()')
        title = dd.xpath('./a/@title')
        detail_url = dd.xpath('./a/@href')
        actors = dd.xpath('./div/div/div/p[2]/text()')
        releasetime = dd.xpath('./div/div/div/p[3]/text()')
        # The score is spread over two <i> elements and joined back together.
        score_head = dd.xpath('./div/div/div[2]/p/i[1]/text()')
        score_tail = dd.xpath('./div/div/div[2]/p/i[2]/text()')

        yield {
            "id": extra_first(item_id),
            # The href is appended to the site root (assumes it is site-relative).
            "detail_url": 'https://maoyan.com' + extra_first(detail_url),
            "title": extra_first(title),
            "actors": extra_first(actors),
            "releasetime": extra_first(releasetime),
            "score": extra_first(score_head) + extra_first(score_tail),
        }
def get_data(self, resp):
    """Yield one record per ``//ol/li`` element of the response HTML.

    Every record carries the keys ``id``, ``title``, ``detail_url``,
    ``content``, ``score`` and ``info``; each value is whatever
    ``extra_first`` (defined elsewhere) returns for the matching XPath result.

    Args:
        resp: Response object whose ``content`` is decoded with the default
            codec and parsed as HTML. A falsy ``resp`` yields nothing.

    Yields:
        dict: One record per matched list item, keys in the order above.
    """
    if not resp:
        return

    page = etree.HTML(resp.content.decode())

    def build(node):
        # Assemble the record for a single <li>; XPaths are relative to it.
        return {
            "id": extra_first(node.xpath("./div/div/em/text()")),
            "title": extra_first(node.xpath("./div/div/a/img/@alt")),
            "detail_url": extra_first(node.xpath("./div/div/a/@href")),
            "content": extra_first(node.xpath("./div/div[2]/div[2]/p/text()")),
            "score": extra_first(
                node.xpath("./div/div[2]/div[2]/div/span[2]/text()")),
            "info": extra_first(node.xpath('./div/div[2]/div[2]/p/span/text()')),
        }

    yield from map(build, page.xpath('//ol/li'))