def parse(self, response):
    # First collect the URLs of all articles on the listing page.
    post_nodes = response.xpath('//*[@id="archive"]//div[@class="post-thumb"]')
    print(post_nodes)
    for post_node in post_nodes:
        post_url = post_node.xpath('.//a/@href').extract_first()  # article URL
        print(post_url)
        # image_url = post_node.xpath('.//img/@src').extract_first()  # article image URL
        # print(image_url)
        # image_url = parse.urljoin(response.url, image_url)  # older article images live under this domain, so join with the base URL
        # yield Request(url=parse.urljoin(response.url, post_url), callback=self.parse_detail,
        #               meta={'image_url': image_url})  # parse the article page elements in the callback
    # Extract the URL of the next page.
    next_url = response.xpath('//*[@id="archive"]//a[contains(@class,"next")]/@href').extract_first()
    if next_url:
        yield Request(url=next_url, callback=self.parse)

def parse_detail(self, response):
    """Load the item through an ItemLoader."""
    image_url = response.meta.get('image_url')  # image URL passed in via meta
    # Use the custom ItemLoader subclass.
    item_loader = ArticleItemLoader(item=ArticleItem(), response=response)
    item_loader.add_xpath('title', '//*[@class="entry-header"]/h1/text()')  # extract via XPath
    item_loader.add_value('url', response.url)  # add a literal value
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_xpath('create_date', '//p[@class="entry-meta-hide-on-mobile"]/text()[1]')
    item_loader.add_value('image_url', [image_url])
    item_loader.add_xpath('praise_nums', "//span[contains(@class,'vote-post-up')]/h10/text()")
    item_loader.add_xpath('fav_nums', "//span[contains(@class,'bookmark-btn')]/text()")
    item_loader.add_xpath('comment_nums', "//a[@href='#article-comment']/span/text()")
    item_loader.add_xpath('content', '//*[@class="entry"]/p | //*[@class="entry"]/h3 | //*[@class="entry"]/ul')
    article_item = item_loader.load_item()  # apply the collected rules; each field comes back as a list
    yield article_item
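# parse_detail above relies on a custom ArticleItemLoader and a get_md5 helper that are
# not shown in this excerpt. A minimal sketch of what they could look like, assuming the
# loader only needs to unwrap single-element lists:
import hashlib

from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst  # itemloaders.processors on newer Scrapy


class ArticleItemLoader(ItemLoader):
    # Return the first extracted value for every field instead of a list.
    default_output_processor = TakeFirst()


def get_md5(url):
    """Hash a URL into a fixed-length id, e.g. for a url_object_id column."""
    if isinstance(url, str):
        url = url.encode('utf-8')
    return hashlib.md5(url).hexdigest()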
def parse(self, response):
    # District names and URLs, skipping the highlighted current tab and "nearby" links.
    district_name = response.xpath(
        '//div[@class="qxName"]/a[not(@class="org bold") and not(contains(text(),"周边"))]/text()'
    ).extract()
    district_url = response.xpath(
        '//div[@class="qxName"]/a[not(@class="org bold") and not(contains(text(),"周边"))]/@href'
    ).extract()
    for name, url in zip(district_name, district_url):
        print("start district =========")
        print(url)
        yield scrapy.Request(url=response.urljoin(url), callback=self.town)
        time.sleep(random.randint(1, 3))  # crude throttle between requests
def town(self, response):
    # Town/sub-area tabs, skipping the currently selected one.
    town_names = response.xpath(
        '//div[@class="option-list sub-option-list gio_plate"]/a[@class!="on"]/text()'
    ).extract()
    town_urls = response.xpath(
        '//div[@class="option-list sub-option-list gio_plate"]/a[@class!="on"]/@href'
    ).extract()
    for name, url in zip(town_names, town_urls):
        print("start town =========")
        print(url)
        yield scrapy.Request(url=response.urljoin(url), callback=self.town_data)
        time.sleep(random.randint(1, 3))
def town(self, response):
    # Business-district links, skipping the highlighted one.
    town_names = response.xpath(
        '//p[@id="shangQuancontain"]/a[not(@class="org bold")]/text()'
    ).extract()
    town_urls = response.xpath(
        '//p[@id="shangQuancontain"]/a[not(@class="org bold")]/@href'
    ).extract()
    for name, url in zip(town_names, town_urls):
        print("start town =========")
        print(url)
        yield scrapy.Request(url=response.urljoin(url), callback=self.town_data)
        time.sleep(random.randint(1, 3))
def town(self, response):
    global page_num
    # Skip the "全部" ("All") tab; every other tab is a crawlable town.
    town_names = response.xpath(
        '//div[@class="sub-items"]/a[@data-id!="全部"]/@data-id').extract()
    town_urls = response.xpath(
        '//div[@class="sub-items"]/a[@data-id!="全部"]/@href').extract()
    page_num = 1
    for name, url in zip(town_names, town_urls):
        print("start town ===========" + url)
        yield scrapy.Request(url=url, headers=self.headers, callback=self.town_data)
        time.sleep(random.randint(1, 3))
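# This spider (and the parse/town_data callbacks below) passes self.headers on every
# request, but the attribute itself is not defined in the excerpt. It is presumably a
# browser-like header set; a hypothetical example:
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/90.0.4430.212 Safari/537.36'),  # hypothetical UA string
}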
def parse_category(self, response):
    total_pages = int(response.xpath(
        '//span[contains(@class,"pageNumbersInfo")]/b/text()').re(r'\d+$')[0])
    ads_available = response.xpath(
        '//table[contains(@class,"tablereset")]//a[contains(@href,"act=4")]')
    if not ads_available:
        logging.error(f"[#!#] No ads for | {response.meta['manufacturer']} {response.meta['model']} |")
        return
    logging.warning(
        f"Processing total of {len(ads_available)} posts for "
        f"{response.meta['manufacturer']} {response.meta['model']}")
    CarsCrawlerMobile.processed_cars_count[response.meta['manufacturer']] = \
        CarsCrawlerMobile.processed_cars_count.get(response.meta['manufacturer'], 0) + len(ads_available)
    # Rebuild the URL with every page number, including the current page.
    url_without_page_number = "=".join(response.url.split('=')[:-1])
    for page_number in range(1, total_pages + 1):
        next_page = f'{url_without_page_number}={page_number}'
        yield Request(next_page, callback=self._parse_page, meta=response.meta)
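# parse_category increments CarsCrawlerMobile.processed_cars_count, which is not
# declared in the excerpt; presumably it is just a dict class attribute on the spider,
# along the lines of:
import scrapy


class CarsCrawlerMobile(scrapy.Spider):
    name = 'cars_mobile'  # hypothetical; the real spider name is not shown
    # Running total of processed ads, keyed by manufacturer.
    processed_cars_count = {}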
def parse_detail(self, response): """提取内容""" node_list = response.xpath("//div[@id='J_goodsList']/ul/li") cnt = 0 print(len(node_list)) for node in node_list: item = JdItem() item['spu_id'] = node.xpath("./@data-spu").extract_first() item['sku_id'] = node.xpath("./@data-sku").extract_first() item['name'] = node.xpath( ".//div[@class='p-name p-name-type-2']/a/em/text()" ).extract_first() item['price'] = node.xpath( ".//div[@class='p-price']/strong/i/text()").extract_first() detail_url = node.xpath( ".//div[@class='p-img']/a/@href").extract_first() item['detail_url'] = 'https:' + detail_url if detail_url else '' default_url = node.xpath( ".//div[@class='p-img']/a/img/@src").extract_first() item['default_url'] = 'https:' + default_url if default_url else '' item['comment'] = node.xpath( ".//div[@class='p-commit']/strong/a/text()").extract_first() item['shop'] = node.xpath( ".//div[@class='p-shop']/span/a/text()").extract_first() item['is_self'] = node.xpath( ".//div[@class='p-icons']/i[1]/text()").extract_first() yield item # yield scrapy.Request(url=url) print('--' * 40) print(cnt)
def _parse_ad(self, response):
    # Relies on the price string sorting before the title on this page layout.
    price, title, *_ = sorted(response.xpath('//div/strong/text()').extract())
    details = response.xpath('//ul[@class="dilarData"]/li/text()').extract()
    # Alternating <li> elements hold label/value pairs.
    car_details = dict(zip(details[::2], details[1::2]))
    # Feature lines start with a bullet ("• "), hence the two-character strip.
    car_features = [x[2:] for x in response.xpath(
        '//div[contains(@style,"margin-bottom:") and contains(.,"•")]/text()').extract()]
    car_features_mapped = {x: 1 if x in car_features else 0 for x in FEATURES_LOOKUP}
    ad_description = "\n".join(
        response.xpath('//div[contains(.,"Допълнителна")]/following::table[1]/tr/td/text()').extract())
    yield {
        'manufacturer': response.meta['manufacturer'].upper(),
        'model': response.meta['model'].upper(),
        'date': datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S'),
        'ad_title': title,
        'price': price,
        'ad_description': ad_description,
        **car_details,
        **car_features_mapped,
    }
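# FEATURES_LOOKUP is imported from elsewhere in the project: a fixed collection of
# feature names, so every yielded record carries the same 0/1 feature columns.
# A hypothetical stand-in (the real list would hold the site's Bulgarian feature labels):
FEATURES_LOOKUP = (
    'Климатик',     # air conditioning -- hypothetical entry
    'Кожен салон',  # leather interior -- hypothetical entry
    'Навигация',    # navigation -- hypothetical entry
)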
def parse(self, response):
    district_name = response.xpath(
        '//span[@class="item-title" and contains(text(), "区域")]/../span[@class="elems-l"]'
        '/a[@class="" and not(contains(text(),"周边"))]/text()'
    ).extract()
    district_url = response.xpath(
        '//span[@class="item-title" and contains(text(), "区域")]/../span[@class="elems-l"]'
        '/a[@class="" and not(contains(text(),"周边"))]/@href'
    ).extract()
    # Only crawl the district whose 1-based index matches today's day of the month,
    # apparently to spread the full crawl across a month of daily runs.
    dy = int(datetime.date.today().day)
    count = 1
    for name, url in zip(district_name, district_url):
        print("start district ========")
        print(url)
        if dy == count:
            yield scrapy.Request(url=url, headers=self.headers,
                                 callback=self.town, meta={'name': name})
            time.sleep(random.randint(1, 3))
        count = count + 1
def getTho(self, response):
    # Two page layouts: classical poems use <p class="Poem_Han">, the rest a PoemBody div.
    if response.xpath("//p[contains(@class,'Poem_Han')]"):
        sites = response.xpath("//td[contains(@valign, 'top')]/p//text()")
    else:
        sites = response.xpath('//div[contains(@id,"PoemBody")]//text()')
    author = response.xpath('//a[contains(@href,"viewauthor")]/b/text()')[0].extract()
    title = response.xpath('//a[contains(@name,"POEM")]/text()').extract()[0]
    filename = "(" + author + ") " + title + ".txt"
    print("*" * 91)
    with open(filename, 'w', encoding='utf-8') as thefile:
        for site in sites:
            thefile.write(site.extract() + '\n')
def getTho(self, response):
    # Join every text fragment of the poem body into one string.
    fragments = response.xpath("//p[contains(@class,'Normal')]//text()")
    a = [''.join(a1 for a1 in fragments.extract())]
    title = response.xpath('//div/h1//text()')[0].extract()
    print(title)
    filename = title + ".txt"
    print("*" * 91)
    with open(filename, 'w', encoding='utf-8') as thefile:
        for a1 in a:
            thefile.write(a1 + '\n')
def parse(self, response):
    # Each <dd> link is a chapter path relative to the book's base URL.
    infos = response.xpath("//dl//dd/a/@href").extract()
    for v in infos:
        yield scrapy.Request("https://www.booktxt.net/0_790/" + v,
                             dont_filter=True, callback=self.parse_Item)
def parse(self, response):
    titles = response.xpath('//*[@class=" subject_new"]/a/text()')
    times = response.xpath('//*[@class="lastpost smalltext"]/text()')
    lastposts = response.xpath('//*[@class="lastpost smalltext"]/a/text()')
    authors = response.xpath('//*[@class="author smalltext"]/a/text()')
    links = response.xpath('//*[@class=" subject_new"]/a/@href')
    for post_title, post_time, post_author, post_lastpost, post_link in zip(
            titles, times, authors, lastposts, links):
        item = HackforumsItem()
        item['Title'] = re.sub(r'[^a-z A-Z0-9?]', '', post_title.extract().strip())
        item['Time'] = post_time.extract().strip()  # assumes the item defines a Time field
        item['Lastpost'] = post_lastpost.extract().strip()
        item['Author'] = post_author.extract().strip()
        item['Link'] = post_link.extract().strip()
        yield item
    # The XPath expression for the next-page link is missing in the source.
    next_page_url = response.xpath().extract_first()
    absolute_next_page_url = response.urljoin(next_page_url)
    yield scrapy.Request(absolute_next_page_url, callback=self.parse)
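# HackforumsItem is not shown; from the fields assigned above it is presumably:
import scrapy


class HackforumsItem(scrapy.Item):
    Title = scrapy.Field()
    Time = scrapy.Field()      # assumed field; see the note in parse above
    Lastpost = scrapy.Field()
    Author = scrapy.Field()
    Link = scrapy.Field()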
def town_data(self, response):
    ershou = ajk()
    data_area = response.xpath('//div[@_soj="xqlb"]')
    print(len(data_area))
    global page_num
    names, prices, bdyears, bdaddrs, bddists = [], [], [], [], []
    lats, lngs, cdates = [], [], []
    for data in data_area:
        name = data.xpath('div[@class="li-info"]/h3/a/text()').extract()
        price = data.xpath('div[@class="li-side"]/p/strong/text()').extract()
        bdyear = data.xpath('div[@class="li-info"]/p[@class="date"]/text()').extract()
        bdaddr = data.xpath('div[@class="li-info"]/address/text()').extract()
        if name:
            names.append(name[0])
            # GPS lookup disabled; keep placeholder coordinates.
            lats.append(' ')
            lngs.append(' ')
            if price:
                prices.append(price[0])
            else:
                prices.append('暂无均价')  # "no average price available"
            if bdyear:
                bdyears.append(self.get_year(bdyear[0]))
            else:
                bdyears.append('9999')
            if bdaddr:
                address = unicodedata.normalize('NFKC', bdaddr[0].strip())
                print(address)
                # The address looks like "[district-town] street", so split on the brackets.
                m = re.findall(r"\[.+\-", address)
                bddists.append(m[0].replace("[", "").replace("-", "").replace("]", ""))
                m = re.findall(r"\].*", address)
                bdaddrs.append(m[0].replace("[", "").replace("]", "").lstrip())
            else:
                bdaddrs.append('暂无地址')  # "no address available"
                bddists.append('暂无地址')
            cdates.append(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        else:
            break
    assert len(names) == len(prices)
    for (block_name, block_price, block_bdyear, block_bdaddr, block_bddist,
         block_lat, block_lng, block_date) in zip(
            names, prices, bdyears, bdaddrs, bddists, lats, lngs, cdates):
        ershou['house_name'] = block_name
        ershou['house_price'] = block_price
        ershou['house_bdyear'] = block_bdyear
        ershou['house_bdaddr'] = block_bdaddr
        ershou['house_bddist'] = block_bddist
        ershou['house_lat'] = block_lat
        ershou['house_lng'] = block_lng
        ershou['craw_date'] = block_date
        ershou['source_web'] = "anjuke"
        yield ershou
    next_link = response.xpath(
        '//div[@class="multi-page"]/a[contains(text(), "下一页")]/@href').extract()
    print(next_link)
    if next_link:
        url = next_link[0]
        page_num = page_num + 1
        print('next page =============' + url)
        time.sleep(random.randint(1, 3))
        yield scrapy.Request(url=url, headers=self.headers, callback=self.town_data)
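# The three town_data variants (anjuke above, fang and lianjia below) all fill the same
# ajk item, and the anjuke one also calls self.get_year. Neither is defined in the
# excerpt; a sketch based on the fields and sentinel values actually used:
import re

import scrapy


class ajk(scrapy.Item):
    house_name = scrapy.Field()
    house_price = scrapy.Field()
    house_bdyear = scrapy.Field()   # year built ('9999' when unknown)
    house_bdaddr = scrapy.Field()   # street address
    house_bddist = scrapy.Field()   # district
    house_lat = scrapy.Field()
    house_lng = scrapy.Field()
    craw_date = scrapy.Field()
    source_web = scrapy.Field()     # "anjuke" / "fang" / "lianjia"


# Hypothetical helper (a spider method in the real code): pull a 4-digit year
# out of strings like "2003年竣工".
def get_year(self, raw):
    m = re.search(r'\d{4}', raw)
    return m.group(0) if m else '9999'

# Note that all three callbacks reuse a single ajk instance across yields; since Scrapy
# items are dict-like, a fresh instance per row would be safer if any pipeline defers
# processing instead of copying the item immediately.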
def town_data(self, response):
    ershou = ajk()
    data_area = response.xpath('//div[@class="houseList"]/div[@class="list rel"]')
    names, prices, bdyears, bdaddrs, bddists = [], [], [], [], []
    lats, lngs, cdates = [], [], []
    for data in data_area:
        print("start community =========")
        name = data.xpath(
            'dl[@class="plotListwrap clearfix"]/dd/p/a[@class="plotTit"]/text()').extract()
        price = data.xpath(
            'div[@class="listRiconwrap"]/p[@class="priceAverage"]/span[1]/text()').extract()
        bdaddr = data.xpath(
            'dl[@class="plotListwrap clearfix"]/dd/p[2]/text()').extract()
        bddistrict = data.xpath(
            'dl[@class="plotListwrap clearfix"]/dd/p[2]/a[1]/text()').extract()
        bdyear = data.xpath(
            'dl[@class="plotListwrap clearfix"]/dd/ul[@class="sellOrRenthy clearfix"]/li[3]/text()'
        ).extract()
        if name:
            names.append(name[0].strip())
            lats.append(" ")   # this site exposes no coordinates
            lngs.append(" ")
            bddists.append(bddistrict[0])
            bdaddrs.append(" ")
            if bdyear:
                bdyears.append(bdyear[0])
            else:
                bdyears.append('9999')
            if price:
                tmp = re.findall(r"[-+]?\d+\.?\d*", price[0].strip())
                prices.append(tmp[0] if tmp else '暂无均价')  # "no average price available"
            else:
                prices.append('暂无均价')
            cdates.append(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    for (block_name, block_price, block_bdyear, block_bdaddr, block_bddist,
         block_lat, block_lng, block_date) in zip(
            names, prices, bdyears, bdaddrs, bddists, lats, lngs, cdates):
        ershou['house_name'] = block_name
        ershou['house_price'] = block_price
        ershou['house_bdyear'] = block_bdyear
        ershou['house_bdaddr'] = block_bdaddr
        ershou['house_bddist'] = block_bddist
        ershou['house_lat'] = block_lat
        ershou['house_lng'] = block_lng
        ershou['craw_date'] = block_date
        ershou['source_web'] = "fang"
        yield ershou
    next_link = response.xpath(
        '//div[@class="fanye gray6"]/a[contains(text(), "下一页")]/@href').extract()
    if next_link:
        url = next_link[0]
        print('next page =============' + url)
        time.sleep(random.randint(1, 3))
        yield scrapy.Request(url=response.urljoin(url), callback=self.town_data)
def town_data(self, response):
    ershou = ajk()
    data_area = response.xpath('//div[@class="list-wrap"]/ul[@class="house-lst"]/li')
    names, prices, bdyears, bdaddrs = [], [], [], []
    lats, lngs, cdates = [], [], []
    for data in data_area:
        name = data.xpath('div[@class="info-panel"]/h2/a/text()').extract()
        price = data.xpath(
            'div[@class="info-panel"]/div[@class="col-3"]/div[@class="price"]'
            '/span[@class="num"]/text()').extract()
        bdaddr = data.xpath(
            'div[@class="info-panel"]/div[@class="col-1"]/div[@class="where"]'
            '/a[@class="actshowMap_list"]/@xiaoqu').extract()
        bddistrict = data.xpath(
            'div[@class="info-panel"]/div[@class="col-1"]/div[@class="where"]'
            '/a[@class="actshowMap_list"]/@districtname').extract()
        bdyear = data.xpath(
            'div[@class="info-panel"]/div[@class="col-1"]/div[@class="other"]'
            '/div[@class="con"]/text()').extract()
        if name:
            names.append(name[0].strip())
            # The @xiaoqu attribute embeds the coordinates as two numbers.
            if bdaddr and bdaddr[0]:
                tmp = re.findall(r"[-+]?\d+\.?\d*", bdaddr[0])
                if tmp:
                    lngs.append(tmp[0])
                    lats.append(tmp[1])
                else:
                    lats.append(" ")
                    lngs.append(" ")
            else:
                lats.append(" ")
                lngs.append(" ")
            bdaddrs.append(bddistrict[0])
            # The fourth "con" text node holds the construction year, when present.
            if len(bdyear) >= 4 and bdyear[3] and bdyear[3].strip():
                bdyears.append(bdyear[3].strip())
            else:
                bdyears.append('9999')
            if price:
                tmp = re.findall(r"[-+]?\d+\.?\d*", price[0].strip())
                prices.append(tmp[0] if tmp else '暂无均价')  # "no average price available"
            else:
                prices.append('暂无均价')
            cdates.append(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    for (block_name, block_price, block_bdyear, block_bdaddr,
         block_lat, block_lng, block_date) in zip(
            names, prices, bdyears, bdaddrs, lats, lngs, cdates):
        ershou['house_name'] = block_name
        ershou['house_price'] = block_price
        ershou['house_bdyear'] = block_bdyear
        ershou['house_bdaddr'] = block_bdaddr
        ershou['house_bddist'] = block_bdaddr  # the district name doubles as the address here
        ershou['house_lat'] = block_lat
        ershou['house_lng'] = block_lng
        ershou['craw_date'] = block_date
        ershou['source_web'] = "lianjia"
        yield ershou
    next_link = response.xpath(
        '//div[@class="page-box house-lst-page-box"]/a[contains(text(), "下一页")]/@href'
    ).extract()
    if next_link:
        url = next_link[0]
        print('next page =============' + url)
        time.sleep(random.randint(1, 3))
        yield scrapy.Request(url=response.urljoin(url), callback=self.town_data)
def parse(self, response):
    print("Processing --> " + response.url)
    responseMetaInfo = response.request.meta or {}
    # Scrape visible text only, skipping script/noscript/style subtrees.
    data = response.xpath(
        '//*[not(ancestor-or-self::script or descendant-or-self::script or '
        'ancestor-or-self::noscript or descendant-or-self::noscript or '
        'ancestor-or-self::style or descendant-or-self::style)]/text()'
    ).getall()
    seller_relation = ''
    adstxtstatus = 0
    track = 0
    # Store an appropriate HTTP status code.
    if response.status == 200:
        httpstatus = response.status
        adstxtstatus = 1
    elif 'redirect_reasons' in responseMetaInfo:
        httpstatus = responseMetaInfo['redirect_reasons'][-1]
        adstxtstatus = 2
    else:
        httpstatus = response.status
        adstxtstatus = 3
    # Retrieve the original URL if there was a redirect.
    if responseMetaInfo.get('redirect_urls'):
        url = responseMetaInfo['redirect_urls'][0]
    else:
        url = response.request.url
    domain = url.replace('http://', '').replace('/ads.txt', '')
    # Write the scraped data into a text file, then read it back line by line.
    with open('Temp.txt', 'w', newline='\n', encoding='utf-8') as tempFile:
        for entry in data:
            tempFile.write(entry.rstrip("\r\n"))
    with open('Temp.txt', 'r', encoding='utf-8') as file:
        # Store ads.txt entries in scraped_info dicts for the final CSV output.
        for row in file:
            lowered = row.lower()
            seller_relation = "Direct" if "direct" in lowered else "Reseller"
            if "direct" in lowered or "reseller" in lowered:
                track = 1
                self.scraped_info = {
                    'Domain Name': domain,
                    'Ads.txt': adstxtstatus,
                    'Ads.txt Line': row.rstrip("\n"),
                    'Seller Relation': seller_relation,
                    'HTTP Status Code': httpstatus,
                }
                yield self.scraped_info
        if track == 0:
            # No direct/reseller line was found; emit the last row as a placeholder.
            self.scraped_info = {
                'Domain Name': domain,
                'Ads.txt': adstxtstatus,
                'Ads.txt Line': row.rstrip("\n"),
                'Seller Relation': seller_relation,
                'HTTP Status Code': httpstatus,
            }
            yield self.scraped_info
def _parse_page(self, response):
    ads = ["https:" + x for x in response.xpath(
        '//td[@class="valgtop"]//a[@class="mmm"]/@href').extract()]
    for ad_url in ads:
        yield Request(ad_url, callback=self._parse_ad, meta=response.meta)
def getTho(self, response):
    # Collect the text of every Java-related link in the table.
    links = response.xpath('//td//a[contains(@href,"java")]/text()')
    with open('testLink.txt', 'w', encoding='utf-8') as thefile:
        for a1 in links:
            thefile.write(a1.extract() + '\n')