def parse_item(self, response):
    item = DongguanItem()
    # Link
    item['url'] = response.url
    # Title
    title = response.xpath(
        '//div[@class="wzy1"]//td/span[@class="niae2_top"]/text()'
    ).extract()[0]
    item['title'] = title
    # Number
    number = response.xpath(
        '//div[@class="wzy1"]//td/span[2]/text()'
    ).extract()[0].split('：')[-1]
    item['number'] = number
    # Content: try the plain-text cell first
    content = response.xpath(
        '//div[@class="wzy1"]//tr[1]/td[@class="txt16_3"]/text()').extract()
    # If the cell is empty (or holds only &nbsp;), the post has images and
    # the text lives inside div.contentext instead
    if len(content) == 0 or (len(content) == 1
                             and content[0].replace(u'\xa0', u'') == ''):
        content = response.xpath(
            '//div[@class="wzy1"]//tr[1]/td[@class="txt16_3"]'
            '/div[@class="contentext"]/text()').extract()
    item['content'] = "".join(content).strip()
    # Hand the item off to the pipeline
    yield item
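# Every snippet here fills a DongguanItem, whose definition is not shown.
# A minimal sketch, assuming only the fields used by the first snippet
# (other variants add fields such as num, question, status, time):
import scrapy

class DongguanItem(scrapy.Item):
    url = scrapy.Field()      # post link
    title = scrapy.Field()    # post title
    number = scrapy.Field()   # post number
    content = scrapy.Field()  # post body text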
def parse_item(self, response):
    item = DongguanItem()
    # Number
    item['number'] = response.xpath(
        "//div[@class='pagecenter p3']//strong/text()").extract()[0].replace(
            '\xa0', '').split('：')[-1]
    # Link
    item['url'] = response.url
    # Title
    item['title'] = response.xpath(
        "//div[@class='pagecenter p3']//strong/text()").extract()[0].replace(
            '\xa0', '').split('编号')[0]
    # Content: try the plain-text rule first; if it matches, it returns all
    # the matched text nodes
    content = "".join(
        response.xpath(
            "//div[@class='pagecenter p3']//div[@class='c1 text14_2']/text()"
        ).extract()).replace("\xa0", "")
    if content != "":
        # Non-empty: the post is plain text
        item['content'] = content
    else:
        # Empty: look inside the contentext div used when images are present
        item['content'] = "".join(
            response.xpath(
                "//div[@class='pagecenter p3']//div[@class='c1 text14_2']"
                "/div[@class='contentext']/text()").extract()).replace(
                    "\xa0", "")
    # Hand the item off to the pipeline
    yield item
def parse(self, response):
    node_list = response.xpath(
        '//*[@id="morelist"]/div/table[2]//tr/td/table')
    for node in node_list:
        id_list = node.xpath('.//tr/td[1]/text()').extract()
        title_list = node.xpath('.//tr/td[2]/a[2]/text()').extract()
        address_list = node.xpath('.//tr/td[2]/a[3]/text()').extract()
        handling_list = node.xpath('.//tr/td[3]/span/text()').extract()
        datime_list = node.xpath('.//tr/td[5]/text()').extract()
        # item['content'] = response.xpath('//div[@class="contentext"]/text() | //div[@class="c1 text14_2"]/text()').extract_first()
        for Id, title, address, handling, datime in zip(
                id_list, title_list, address_list, handling_list,
                datime_list):
            # Create a fresh item per row so yielded items don't share state
            item = DongguanItem()
            item['Id'] = Id
            item['title'] = title
            item['address'] = address
            item['handling'] = handling
            item['datime'] = datime
            yield item
    # Next page: stop when the listing is empty or the last page is reached
    if not node_list or self.offset == 32460:
        return
    self.offset += 30
    print('Crawling page ' + str(self.offset // 30) + '.....')
    yield scrapy.Request(
        url='http://wz.sun0769.com/index.php/question/questionType'
            '?type=4&page=' + str(self.offset),
        callback=self.parse)
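# The pagination above relies on self.offset and a hard-coded listing URL
# defined on the spider. A minimal sketch of the scaffolding these callbacks
# assume (the class name and starting offset are assumptions):
import scrapy

class SunSpider(scrapy.Spider):
    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    url = 'http://wz.sun0769.com/index.php/question/questionType?type=4&page='
    offset = 0
    start_urls = [url + str(offset)]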
def parse_item(self, response):
    # Example title text: "提问:梁屋山边路与梁屋北路违停严重  编号:194771"
    # ("question: ...  number: 194771")
    item = DongguanItem()
    re_title = Selector(response=response).xpath(
        "/html/body/div[6]/div/div[1]/div[1]/strong/text()").extract()[0]
    # title = re.match(r'提问:(\S+) +编号:(\d+)', re_title).group(1)
    title = re_title.split("　")[0].split('：')[-1]
    # number = re.match(r'提问:(\S+) +编号:(\d+)', re_title).group(2)
    number = re_title.split("　")[-1].split('：')[-1]
    content = Selector(response=response).xpath(
        "//html/body/div[6]/div/div[2]/div[1]/text()").extract()
    # re_date_time = Selector(response=response).xpath(
    #     "//p[@class='te12h']/text()").extract()
    # data_time = re.match(r'\S+ 发言时间:(\w \w)', re_date_time).group(1)
    # date_time = re_date_time.split(' ')[1].split(':')[-1]
    status = Selector(response=response).xpath(
        "//div[@class='audit']/div[@class='cleft']/span/text()").extract()
    item['title'] = title
    item['number'] = number
    item['content'] = content[0]
    # item['date_time'] = date_time
    item['status'] = status[0]
    print('\n', re_title, '\n')
    yield item
def parse_res(self, response):
    # print(response.text)
    # pyquery version of the same extraction:
    # item = DongguanItem()
    # doc = pq(etree.HTML(response.text))
    # text = doc('.p3 .tgray14').text()
    # item['question'] = re.search('提问:(.*?)编号', text).group(1)
    # item['content'] = doc('.c1').text()
    # item['status'] = doc('.text14_2 .cleft').text().split('：')[-1]
    # item['time'] = doc('.text14_2 .cright .te12h').text().split('：')[-1]
    item = DongguanItem()
    item['question'] = response.xpath(
        '//div[@class="wzy1"]/table[1]//td[2]/span[1]/text()')[0].extract(
        ).strip().replace('\xa0', '')
    item['content'] = response.xpath(
        '//div[@class="wzy1"]/table[2]//tr[1]/td/text()')[0].extract(
        ).replace('\xa0', '')
    item['status'] = response.xpath(
        '//div[@class="wzy3_1"]/span/text()')[0].extract().replace(
            '\xa0', '')
    item['time'] = response.xpath(
        '//div[@class="wzy3_2"]/span[1]/text()')[0].extract().replace(
            '\xa0', '')
    yield item
def parse_item(self, response):
    item = DongguanItem()
    title_block = response.xpath(
        '//div[@class="pagecenter p3"]//strong[@class="tgray14"]/text()'
    ).extract()[0]
    # HTML &nbsp; entities decode to u'\xa0', so split the header on a pair
    # of them
    title_list = title_block.split(u"\xa0\xa0")
    # logging.debug(title_list)
    title = title_list[0].split('：')[1]
    code = title_list[1].split('：')[1]
    # logging.debug(title, code)
    # The content may contain images: if so the text lives in contentext,
    # otherwise it sits directly under "c1 text14_2"
    content = response.xpath('//div[@class="contentext"]/text()').extract()
    if len(content) == 0:
        content = response.xpath(
            '//div[@class="c1 text14_2"]/text()').extract()
    content = ''.join(content)
    content = content.replace(' ', '')
    # logging.debug(content)
    url = response.url
    item['title'] = title if title else ''
    item['content'] = content if content else ''
    item['url'] = url if url else ''
    item['code'] = code if code else ''
    yield item
def parse_item(self, response):
    item = DongguanItem()
    item['number'] = response.xpath(
        "//div[@class='pagecenter p3']//strong/text()").extract()[0].replace(
            '\xa0', '').split('：')[-1]
    item['url'] = response.url
    item['title'] = response.xpath(
        "//div[@class='pagecenter p3']//strong/text()").extract()[0].replace(
            '\xa0', '').split('编号')[0]
    # Posts with images and plain-text posts sit under different tags, so
    # first check whether the plain-text rule finds anything
    content = "".join(
        response.xpath(
            "//div[@class='pagecenter p3']//div[@class='c1 text14_2']/text()"
        ).extract()).replace("\xa0", "")
    if content != "":
        # Non-empty: the post is plain text
        item['content'] = content
    else:
        # Empty: look inside the contentext div used when images are present
        item['content'] = "".join(
            response.xpath(
                "//div[@class='pagecenter p3']//div[@class='c1 text14_2']"
                "/div[@class='contentext']/text()").extract()).replace(
                    "\xa0", "")
    yield item
def parse_item(self, response): print("url:%s" % response.url) item = DongguanItem() # 标题 item['title'] = response.xpath( '//head/title/text()').extract()[0].replace('_阳光热线问政平台', "") #编号 item['number'] = ( response.xpath('//div[@class="pagecenter p3"]//strong//text()' ).extract()[0]).split(':')[-1] #帖子内容,默认取出有图片情况下的文字内容列表 content = response.xpath('//div[@class="contentext"]/text()').extract() #如果没有图片,则取出没有图片情况下的文字内容列表 if len(content) == 0: content = response.xpath( '//div[@class="c1 text14_2"]/text()').extract() item['content'] = "".join(content).strip() #链接 item['url'] = response.url #帖子状态 item['status'] = response.xpath( '//div[@class="audit"]//span/text()').extract()[0] #网友 item['net_friend'] = ( response.xpath('//div[@class="cright"]//p//text()').extract()[0] ).split("发言时间")[0].split(":")[1].strip() #时间 item['time'] = (response.xpath('//div[@class="cright"]//p//text()'). extract()[0]).split("发言时间")[1].strip() # yield item yield item
def parse_item(self, response):
    print(response.url)
    item = DongguanItem()
    # Title
    t_n = response.xpath(
        '//div[contains(@class, "pagecenter p3")]//strong/text()').extract(
        )[0]
    item['title'] = t_n.split()[-2][3:]
    # Number
    item['number'] = t_n.split()[-1].split("：")[-1]
    # Text content: try the with-image rule first
    content = response.xpath('//div[@class="contentext"]/text()').extract()
    # If there are no images, take the no-image text nodes instead
    if len(content) == 0:
        content = response.xpath(
            '//div[@class="c1 text14_2"]/text()').extract()
    # content is a list; join it into one string and strip the whitespace
    item['content'] = "".join(content).strip()
    # Link
    item['url'] = response.url
    yield item
def parse_item(self, response): item = DongguanItem() item['title'] = response.xpath( '//div[@class="greyframe"]//strong/text()').extract()[0] item['number'] = item['title'].split(' ')[-1].split(':')[-1] item['content'] = response.xpath( '//div[@class="contentext"]/text()').extract()[0] item['url'] = response.url yield item
def parse_item(self, response):
    item = DongguanItem()
    item['title'] = response.xpath(
        '/html/body/div[6]/div/div[1]/div[1]/strong/text()').extract()[0]
    item['num'] = item['title'].split("　")[-1].split('：')[-1]
    item['content'] = response.xpath(
        '/html/body/div[6]/div/div[2]/div[1]/text()').extract()[0]
    item['url'] = response.url
    yield item
def parse_item(self, response):
    # Plain-xpath version of the same extraction:
    # i = {}
    # i['title'] = response.xpath('//div[@class="pagecenter p3"]//strong[@class="tgray14"]/text()').extract()[0].split()[0]
    # i['number'] = response.xpath('//div[@class="pagecenter p3"]//strong[@class="tgray14"]/text()').extract()[0].split()[-1].split('：')[-1]
    # i['content'] = response.xpath('string(//div[@class="c1 text14_2"])').extract()[0].split()
    # return i
    item_loader = QItemLoader(item=DongguanItem(), response=response)
    item_loader.add_xpath(
        'title',
        '//div[@class="pagecenter p3"]//strong[@class="tgray14"]/text()')
    item_loader.add_xpath(
        'number',
        '//div[@class="pagecenter p3"]//strong[@class="tgray14"]/text()')
    item_loader.add_xpath('content', 'string(//div[@class="c1 text14_2"])')
    # Load the populated item
    item = item_loader.load_item()
    yield item
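# QItemLoader above is a project-specific loader that is not shown. A
# plausible sketch, assuming per-field input processors -- 'title' and
# 'number' are both loaded from the same <strong> text, so the loader must
# split that text differently per field (the lambdas are assumptions):
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst  # itemloaders.processors on newer Scrapy

class QItemLoader(ItemLoader):
    default_output_processor = TakeFirst()
    # "提问:<question>  编号:<number>" -> keep the question part
    title_in = MapCompose(
        lambda s: s.replace(u'\xa0', ' ').split()[0].split('：')[-1])
    # ... and the trailing digits for the number
    number_in = MapCompose(lambda s: s.split('：')[-1].strip())
    content_in = MapCompose(lambda s: s.strip())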
def parse_details(self, response):
    item = DongguanItem()
    item['name'] = response.xpath(
        '//div[@class="pagecenter p3"]//strong[@class="tgray14"]/text()'
    ).extract()[0]
    print(item['name'])
    item['num'] = item['name'].split('　')[-1].split("：")[-1]
    item['detail_link'] = response.url
    item['content'] = response.xpath(
        '//div[@class="content text14_2"]/div/text()').extract()[0]
    yield item
def parse_item(self, response): item = DongguanItem() item['title'] = response.xpath( '//div[contains(@class, "pagecenter p3")]//strong/text()').extract( )[0] item['number'] = item['title'].split(' ')[-1].split(":")[-1] item['content'] = response.xpath( '//div[@class="c1 text14_2"]/text()').extract()[0] item['url'] = response.url yield item
def list_detail(self, response):
    item = DongguanItem()
    doc = pq(response.text)
    # All <tr> rows except the header row
    tr_list = doc("#houseTable_1 tr:gt(0)").items()
    for tr in tr_list:
        # href of the link in the second td (td:eq(1))
        item['projectUrl'] = self.base_url + tr('td:eq(1) a').attr('href')
        item['projectTitle'] = tr('td:eq(1) a').text()
        yield scrapy.Request(
            item['projectUrl'],
            callback=self.floorsDetail,
            # deep-copy the item so each request carries its own snapshot
            meta={'item': copy.deepcopy(item)})
def parse_item(self, response):
    print(response.url)
    item = DongguanItem()
    item['title'] = response.xpath(
        '//div[contains(@class, "pagecenter p3")]//strong/text()').extract(
        )[0]
    item['number'] = item['title'].split('　')[-1].split('：')[-1]
    item['content'] = response.xpath(
        '//div[@class="c1 text14_2"]/text()').extract()[0]
    item['url'] = response.url
    yield item
def parse_item(self, response):
    item = DongguanItem()
    item['question'] = response.xpath(
        "//div[@class='wzy1']//td/span[1]/text()").extract()[0]
    item['num'] = response.xpath(
        "//div[@class='wzy1']//td/span[2]/text()").extract()[0]
    item['context'] = response.xpath(
        "//div[@class='wzy1']//tr[1]/td[@class='txt16_3']/text()").extract(
        )[0]
    item['url'] = response.url
    yield item
def parse_item(self, response):
    item = DongguanItem()
    # Grab the question line, which also carries the number
    title = response.xpath(
        '//div[@class="pagecenter p3"]//strong/text()').extract()[0]
    # Slice out the question text
    item["question"] = title.split('　')[1]
    # Slice out the number
    item["number"] = title.split('　')[-1].split('：')[-1]
    item["url"] = response.url
    item["answer"] = response.xpath(
        '//div[@class="c1 text14_2"]/text()').extract()[0]
    yield item
def parse_item(self, response): item = DongguanItem() item['title'] = response.xpath( '//div[contains(@class, "pagecenter p3")]//strong/text()').extract( )[0] item['number'] = item['title'].split(' ')[-1].split(':')[-1] content = response.xpath('//div[@class="contentext"]/text()').extract() if len(content) == 0: content = response.xpath( '//div[@class="c1 text14_2"]/text()').extract() item['content'] = "".join(content).strip() else: item['content'] = "".join(content).strip() item['url'] = response.url yield item
def parse_item(self, response):
    # print(response.url)
    item = DongguanItem()
    item['title'] = response.xpath(
        "//div[@class='pagecenter p3']//strong/text()").extract()[0]
    item['titleId'] = item['title'].strip().split("　")[-1].split("：")[-1]
    # When the post has images, a div with class "contentext" is present
    contentext = response.xpath(
        "//div[@class='contentext']/text()").extract()
    # An empty list means no images, so use the plain-text rule instead
    if len(contentext) == 0:
        contentext = response.xpath(
            "//div[@class='c1 text14_2']/text()").extract()
    item['content'] = ''.join(contentext).strip()
    item['url'] = response.url
    yield item
def parse_item(self, response):
    item = DongguanItem()
    # Number:
    item['number_title'] = response.xpath(
        '//div[@class="wzy1"]//td/span[2]/text()').extract()[0].split(
            '：')[-1].split(':')[-1]
    other = response.xpath(
        '//div[@class="wzy3_2"]/span/text()').extract()[0]
    # Poster (net friend):
    item['name_title'] = other.split()[0].split('：')[-1]
    # Time:
    item['time_title'] = other.split()[1].split(
        '：')[-1] + ' ' + other.split()[-1]
    # Title
    item['title'] = response.xpath(
        '//div[@class="wzy1"]//td/span[1]/text()').extract()[0].split(
            '：')[-1]
    # Content without images:
    # content = response.xpath('//div[@class="wzy1"]//tr/td[@class="txt16_3"]/text()').extract()[0]
    # Content with images:
    content_has = response.xpath(
        '//div[@class="wzy1"]//td/div[@class="contentext"]/text()'
    ).extract()
    content_no = response.xpath(
        '//div[@class="wzy1"]//tr/td[@class="txt16_3"]/text()').extract()
    # Concatenate whichever node list matched
    string_content = ''
    if len(content_has) == 0:
        for i in content_no:
            string_content += i.strip()
    else:
        for i in content_has:
            string_content += i.strip()
    item['content'] = string_content
    # Handling status:
    item['parsetype'] = response.xpath(
        '//div[@class="wzy3_1"]/span/text()').extract()[0]
    item['url'] = response.url
    yield item
def parse_item(self, response):
    print(response.url)
    item = DongguanItem()
    title = response.xpath(
        "//div[@class='pagecenter p3']//strong/text()").extract()[0]
    item['title'] = title.split(u"：")[-1].split(":")[0][:-2]
    item['url'] = response.url
    item['number'] = title.split("：")[-1]
    content = response.xpath("//div[@class='contentext']/text()").extract()
    if len(content) == 0:
        item['content'] = " ".join(
            response.xpath(
                "//div[@class='c1 text14_2']/text()").extract()).strip()
    else:
        item['content'] = " ".join(content).strip()
    yield item
def parse_item(self, response):
    items = DongguanItem()
    temp = response.xpath(
        "//div[@class='pagecenter p3']//strong/text()").extract()
    # Guard against an empty list: take the first element if present,
    # otherwise fall back to an empty string
    if temp:
        temp = temp[0]
    else:
        temp = ""
    items['title'] = temp.strip().split('\xa0')[0].split('：')[-1]
    items['number'] = temp.strip().split('\xa0')[-1].split('：')[-1]
    items['content'] = clear_list(
        response.xpath("//div[@class='c1 text14_2']/text()").extract(),
        response)
    items['url'] = response.url
    yield items
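# clear_list above is a project helper that is not shown. A plausible sketch,
# assuming it joins the extracted text nodes and falls back to the with-image
# div when the plain-text nodes are empty (the body is an assumption; only
# the call signature comes from the snippet above):
def clear_list(content, response):
    text = "".join(content).replace(u'\xa0', '').strip()
    if not text:
        # Posts with images keep their text inside div.contentext
        text = "".join(response.xpath(
            "//div[@class='contentext']/text()").extract()).strip()
    return text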
def parse_item(self, response):
    content = ''.join(
        response.xpath(
            '//div[@class="contentext"]/text()').extract()).replace(
                u'\xa0', '').strip()
    if not content:
        content = ''.join(
            response.xpath(
                '//div[@class="c1 text14_2"]/text()').extract()).replace(
                    u'\xa0', '').strip()
    item = DongguanItem()
    item['title'] = response.xpath(
        '//div[@class="pagecenter p3"]//strong/text()').extract_first(
        ).replace(u'\xa0', ' ').strip()
    item['num'] = item['title'].split('：')[-1].strip()
    item['content'] = content
    item['url'] = response.url
    yield item
def parse_item(self, response):
    item = DongguanItem()
    # Title
    item['title'] = response.xpath(
        '//div[contains(@class, "pagecenter p3")]//strong/text()').extract(
        )[0]
    # Number
    item['number'] = item['title'].split('　')[-1].split('：')[-1]
    # Content: try the with-image rule first; if it matches, it returns the
    # list of all matched text nodes
    content = response.xpath('//div[@class="contentext"]/text()').extract()
    # An empty list means there are no images, so use the no-image rule
    if len(content) == 0:
        content = response.xpath(
            '//div[@class="c1 text14_2"]/text()').extract()
    item['content'] = "".join(content).strip()
    item['url'] = response.url
    yield item
def parse(self, response):
    url_list = response.xpath("//ul[@class='carlist clearfix js-top']/li")
    print(url_list, "*" * 100)
    # Iterate over the <li> nodes
    for li in url_list:
        item = DongguanItem()
        # item['carurl'] = li.xpath("")
        # Car name
        item['carname'] = li.xpath(
            ".//h2[@class='t']/text()").extract_first()
        # Listing date
        item['date'] = li.xpath(".//div[@class='t-i']/text()").extract()
        # item['licheng'] = li.xpath(".//div[@class='t-i']/text()")
        # Price
        item['prize'] = li.xpath(
            ".//div[@class='t-price']/p/text()").extract_first()
        # Status
        item['state'] = li.xpath(
            ".//i[@class='i-orange']/text()").extract_first()
        yield item
    # Follow the next listing page until page 161
    if self.offset <= 161:
        self.offset = self.offset + 1
        yield scrapy.Request(self.url + str(self.offset),
                             callback=self.parse)
def parse(self, response):
    # print(response)
    tr_list = response.xpath('//table[2]/tr')
    for tr in tr_list:
        item = DongguanItem()
        item['num'] = tr.xpath('./td[1]/text()').extract_first()
        item['title'] = tr.xpath('./td[3]/a[1]/text()').extract_first()
        item['href'] = tr.xpath('./td[3]/a[1]/@href').extract_first()
        item['status'] = tr.xpath('./td[4]/span/text()').extract_first()
        item['name'] = tr.xpath('./td[5]/text()').extract_first()
        item['date'] = tr.xpath('./td[6]/text()').extract_first()
        # Pass the partially filled item to the detail page via meta
        yield scrapy.Request(item['href'],
                             callback=self.detail_parse,
                             meta={'item': item})
    next_url = response.xpath(
        '//div[@class="pagination"]//a[text()=">"]/@href').extract_first()
    if next_url is not None:
        yield scrapy.Request(next_url, callback=self.parse)
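# detail_parse above is referenced but not shown. A minimal sketch, assuming
# the detail page's body sits in the usual "c1 text14_2" div and the item
# rides along in response.meta (the xpath is an assumption):
def detail_parse(self, response):
    item = response.meta['item']
    item['content'] = "".join(response.xpath(
        '//div[@class="c1 text14_2"]/text()').extract()).strip()
    yield item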
def parse_item(self, response):
    # print(response.url)
    item = DongguanItem()
    item['title'] = response.xpath(
        '//div[@class="pagecenter p3"]//strong/text()').extract()[0]
    item['number'] = item['title'].split('　')[-1].split('：')[-1]
    # Try the with-image rule first: it returns the list of all matched text
    # nodes, or an empty list when the post has no images
    content = response.xpath('//div[@class="contentext"]/text()').extract()
    # Fall back to the no-image rule
    if len(content) == 0:
        content = response.xpath(
            '//div[@class="c1 text14_2"]/text()').extract()
    item['content'] = "".join(content).strip()
    item['url'] = response.url
    yield item
def parse_item(self, response):
    item = DongguanItem()
    # Link
    item['url'] = response.url
    # Title
    item['title'] = response.xpath(
        '//div[@class="pagecenter p3"]//strong/text()').extract()[0]
    # Number
    item['num'] = item['title'].split('　')[-1].split("：")[-1]
    # Content: try the with-image rule first; it returns the list of all
    # matched text nodes
    content = response.xpath('//div[@class="contentext"]/text()').extract()
    # An empty list means no images, so use the no-image rule instead
    if len(content) == 0:
        content = response.xpath(
            '//div[@class="c1 text14_2"]/text()').extract()
    item['content'] = "".join(content).strip()
    yield item
def parse_item(self, response): item = DongguanItem() #标题 item["title"] = response.xpath( '//div[@class="pagecenter p3"]//strong//text()').extract()[0] #编号 item["number"] = item["title"].split('.')[-1].split(":")[-1] #内容(返回是一个列表) 如果有内用,则返回列表,无无内容则返回为空列表 content = response.xpath('//div[@class="contentext"]/text()').extract() #如果没内用,则是使用无图片的的匹配的规则 (有图片与无图片的是有区别的) if len(content) == 0: content = response.xpath( '//div[@class="c1 text14_2"]/text()').extract() item["content"] = "".join(content).strip() else: item["content"] = "".join(content).strip() #链接 item["url"] = response.url yield item